Upload folder: outputs/Qwen3-8B
Browse files- .gitattributes +1 -0
- outputs/Qwen3-8B/w4a4/exp/config.json +45 -0
- outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth +3 -0
- outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth +3 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162158.txt +58 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162240.txt +59 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162627.txt +61 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162858.txt +59 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_163842.txt +61 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_164042.txt +62 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_164538.txt +135 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_165516.txt +106 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_165858.txt +79 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_184025.txt +675 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_195354.txt +680 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260109_092702.txt +680 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_062728.txt +63 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_063624.txt +63 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_155601.txt +65 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_160154.txt +70 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_163532.txt +68 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173005.txt +68 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173513.txt +65 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173832.txt +65 -0
- outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_181953.txt +68 -0
- outputs/Qwen3-8B/w4a4/exp/model-00001-of-00002.safetensors +3 -0
- outputs/Qwen3-8B/w4a4/exp/model-00002-of-00002.safetensors +3 -0
- outputs/Qwen3-8B/w4a4/exp/model.safetensors.index.json +0 -0
- outputs/Qwen3-8B/w4a4/exp/quantization_config.json +7 -0
- outputs/Qwen3-8B/w4a4/exp/tokenizer.json +3 -0
- outputs/Qwen3-8B/w4a4/exp/tokenizer_config.json +239 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
outputs/Qwen3-8B/w4a4/exp/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
outputs/Qwen3-8B/w4a4/exp/config.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3FlatQuantForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 151643,
|
| 8 |
+
"eos_token_id": 151645,
|
| 9 |
+
"head_dim": 128,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 4096,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 12288,
|
| 14 |
+
"max_position_embeddings": 40960,
|
| 15 |
+
"max_window_layers": 36,
|
| 16 |
+
"model_type": "qwen3",
|
| 17 |
+
"num_attention_heads": 32,
|
| 18 |
+
"num_hidden_layers": 36,
|
| 19 |
+
"num_key_value_heads": 8,
|
| 20 |
+
"rms_norm_eps": 1e-06,
|
| 21 |
+
"rope_scaling": null,
|
| 22 |
+
"rope_theta": 1000000,
|
| 23 |
+
"sliding_window": null,
|
| 24 |
+
"tie_word_embeddings": false,
|
| 25 |
+
"torch_dtype": "bfloat16",
|
| 26 |
+
"transformers_version": "4.51.0",
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_sliding_window": false,
|
| 29 |
+
"vocab_size": 151936,
|
| 30 |
+
"fake_quant_config": {
|
| 31 |
+
"w_bits": 4,
|
| 32 |
+
"a_bits": 4,
|
| 33 |
+
"a_asym": false,
|
| 34 |
+
"w_asym": false,
|
| 35 |
+
"k_bits": 16,
|
| 36 |
+
"k_asym": false,
|
| 37 |
+
"k_groupsize": -1,
|
| 38 |
+
"v_bits": 16,
|
| 39 |
+
"v_asym": false,
|
| 40 |
+
"v_groupsize": -1,
|
| 41 |
+
"lwc": true,
|
| 42 |
+
"lac": true,
|
| 43 |
+
"direct_inv": false
|
| 44 |
+
}
|
| 45 |
+
}
|
outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea4be9a10ff9947ffff53ff39806eb983423776916352323e46f9eec6ed60dbc
|
| 3 |
+
size 31799687
|
outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b5aeb255e7df9875349b4ebdf66b04278604545d1a123fd1979a142ba3e487f4
|
| 3 |
+
size 32004907
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162158.txt
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 16:21:58 root] (args_utils.py 159): INFO Arguments:
|
| 2 |
+
[2026-01-08 16:21:58 root] (args_utils.py 160): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': -1,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen3/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': -1,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 16:21:58 root] (args_utils.py 161): INFO ------------------------------------------------------------
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162240.txt
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 16:22:40 root] (args_utils.py 159): INFO Arguments:
|
| 2 |
+
[2026-01-08 16:22:40 root] (args_utils.py 160): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': -1,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': -1,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 16:22:40 root] (args_utils.py 161): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 16:22:41 root] (model_utils.py 92): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162627.txt
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 16:26:27 root] (args_utils.py 159): INFO Arguments:
|
| 2 |
+
[2026-01-08 16:26:27 root] (args_utils.py 160): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': -1,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': -1,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 16:26:27 root] (args_utils.py 161): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 16:26:28 root] (model_utils.py 92): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-08 16:26:51 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-08 16:26:56 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162858.txt
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 16:28:58 root] (args_utils.py 159): INFO Arguments:
|
| 2 |
+
[2026-01-08 16:28:58 root] (args_utils.py 160): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': -1,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': -1,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 16:28:58 root] (args_utils.py 161): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 16:28:58 root] (model_utils.py 81): ERROR Qwen3 model is not available. Error: attempted relative import with no known parent package
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_163842.txt
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 16:38:42 root] (args_utils.py 158): INFO Arguments:
|
| 2 |
+
[2026-01-08 16:38:42 root] (args_utils.py 159): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': -1,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': -1,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 16:38:42 root] (args_utils.py 160): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 16:38:43 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-08 16:39:04 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-08 16:39:09 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_164042.txt
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 16:40:42 root] (args_utils.py 158): INFO Arguments:
|
| 2 |
+
[2026-01-08 16:40:42 root] (args_utils.py 159): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': -1,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': -1,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 16:40:42 root] (args_utils.py 160): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 16:40:43 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-08 16:41:03 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-08 16:41:08 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 62 |
+
[2026-01-08 16:41:11 root] (train_utils.py 99): INFO ========= Layer 0 =========
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_164538.txt
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 16:45:38 root] (args_utils.py 158): INFO Arguments:
|
| 2 |
+
[2026-01-08 16:45:38 root] (args_utils.py 159): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': -1,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': -1,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 16:45:38 root] (args_utils.py 160): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 16:45:39 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-08 16:45:59 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-08 16:46:04 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 62 |
+
[2026-01-08 16:46:06 root] (train_utils.py 108): INFO ========= Layer 0 =========
|
| 63 |
+
[2026-01-08 16:46:14 root] (train_utils.py 181): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 5.177632s, mse: 0.75320721
|
| 64 |
+
[2026-01-08 16:46:18 root] (train_utils.py 181): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.951230s, mse: 0.69738150
|
| 65 |
+
[2026-01-08 16:46:22 root] (train_utils.py 181): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.918307s, mse: 0.57322526
|
| 66 |
+
[2026-01-08 16:46:26 root] (train_utils.py 181): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.922915s, mse: 0.53385043
|
| 67 |
+
[2026-01-08 16:46:30 root] (train_utils.py 181): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.897071s, mse: 0.52587473
|
| 68 |
+
[2026-01-08 16:46:34 root] (train_utils.py 181): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.904243s, mse: 0.52103043
|
| 69 |
+
[2026-01-08 16:46:38 root] (train_utils.py 181): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.906013s, mse: 0.51764816
|
| 70 |
+
[2026-01-08 16:46:42 root] (train_utils.py 181): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.934398s, mse: 0.51576799
|
| 71 |
+
[2026-01-08 16:46:46 root] (train_utils.py 181): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 3.926790s, mse: 0.51471919
|
| 72 |
+
[2026-01-08 16:46:50 root] (train_utils.py 181): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.909371s, mse: 0.51408356
|
| 73 |
+
[2026-01-08 16:46:54 root] (train_utils.py 181): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 3.908848s, mse: 0.51356357
|
| 74 |
+
[2026-01-08 16:46:57 root] (train_utils.py 181): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.920370s, mse: 0.51325279
|
| 75 |
+
[2026-01-08 16:47:01 root] (train_utils.py 181): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.914565s, mse: 0.51308525
|
| 76 |
+
[2026-01-08 16:47:05 root] (train_utils.py 181): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.907758s, mse: 0.51298046
|
| 77 |
+
[2026-01-08 16:47:09 root] (train_utils.py 181): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.903909s, mse: 0.51291251
|
| 78 |
+
[2026-01-08 16:47:10 root] (train_utils.py 187): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 79 |
+
[2026-01-08 16:47:10 root] (train_utils.py 108): INFO ========= Layer 1 =========
|
| 80 |
+
[2026-01-08 16:47:17 root] (train_utils.py 181): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 4.529751s, mse: 1.26446903
|
| 81 |
+
[2026-01-08 16:47:21 root] (train_utils.py 181): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 3.905758s, mse: 1.20416105
|
| 82 |
+
[2026-01-08 16:47:25 root] (train_utils.py 181): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.919909s, mse: 1.32050300
|
| 83 |
+
[2026-01-08 16:47:29 root] (train_utils.py 181): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 3.911430s, mse: 1.18387961
|
| 84 |
+
[2026-01-08 16:47:33 root] (train_utils.py 181): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 3.908307s, mse: 1.16144323
|
| 85 |
+
[2026-01-08 16:47:37 root] (train_utils.py 181): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.900176s, mse: 1.14692831
|
| 86 |
+
[2026-01-08 16:47:41 root] (train_utils.py 181): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.908509s, mse: 1.13803911
|
| 87 |
+
[2026-01-08 16:47:44 root] (train_utils.py 181): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 3.990242s, mse: 1.13248944
|
| 88 |
+
[2026-01-08 16:47:48 root] (train_utils.py 181): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 3.922920s, mse: 1.12851596
|
| 89 |
+
[2026-01-08 16:47:52 root] (train_utils.py 181): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 3.909056s, mse: 1.12513459
|
| 90 |
+
[2026-01-08 16:47:56 root] (train_utils.py 181): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 3.911661s, mse: 1.12304866
|
| 91 |
+
[2026-01-08 16:48:00 root] (train_utils.py 181): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 3.920923s, mse: 1.12149227
|
| 92 |
+
[2026-01-08 16:48:04 root] (train_utils.py 181): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 3.930339s, mse: 1.12047637
|
| 93 |
+
[2026-01-08 16:48:08 root] (train_utils.py 181): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.923637s, mse: 1.12013018
|
| 94 |
+
[2026-01-08 16:48:12 root] (train_utils.py 181): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 3.901148s, mse: 1.12002420
|
| 95 |
+
[2026-01-08 16:48:12 root] (train_utils.py 187): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 96 |
+
[2026-01-08 16:48:13 root] (train_utils.py 108): INFO ========= Layer 2 =========
|
| 97 |
+
[2026-01-08 16:48:20 root] (train_utils.py 181): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 4.564884s, mse: 2.01507950
|
| 98 |
+
[2026-01-08 16:48:24 root] (train_utils.py 181): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 3.932034s, mse: 1.94718516
|
| 99 |
+
[2026-01-08 16:48:28 root] (train_utils.py 181): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 3.938544s, mse: 1.88504732
|
| 100 |
+
[2026-01-08 16:48:31 root] (train_utils.py 181): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 3.911523s, mse: 1.81488454
|
| 101 |
+
[2026-01-08 16:48:35 root] (train_utils.py 181): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 3.927961s, mse: 1.85724211
|
| 102 |
+
[2026-01-08 16:48:39 root] (train_utils.py 181): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 3.927059s, mse: 2.01470947
|
| 103 |
+
[2026-01-08 16:48:43 root] (train_utils.py 181): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 3.914749s, mse: 1.76976871
|
| 104 |
+
[2026-01-08 16:48:47 root] (train_utils.py 181): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 3.930597s, mse: 1.76208174
|
| 105 |
+
[2026-01-08 16:48:51 root] (train_utils.py 181): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 3.942161s, mse: 1.75805795
|
| 106 |
+
[2026-01-08 16:48:55 root] (train_utils.py 181): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.925775s, mse: 1.75492477
|
| 107 |
+
[2026-01-08 16:48:59 root] (train_utils.py 181): INFO layer 2 lwc lac iter 10, lr 0.00083135 time 3.927086s, mse: 1.75177002
|
| 108 |
+
[2026-01-08 16:49:03 root] (train_utils.py 181): INFO layer 2 lwc lac iter 11, lr 0.00048198 time 3.916509s, mse: 1.74904215
|
| 109 |
+
[2026-01-08 16:49:07 root] (train_utils.py 181): INFO layer 2 lwc lac iter 12, lr 0.00022092 time 3.925464s, mse: 1.74770069
|
| 110 |
+
[2026-01-08 16:49:11 root] (train_utils.py 181): INFO layer 2 lwc lac iter 13, lr 0.00005958 time 3.931881s, mse: 1.74607742
|
| 111 |
+
[2026-01-08 16:49:15 root] (train_utils.py 181): INFO layer 2 lwc lac iter 14, lr 0.00000500 time 3.918606s, mse: 1.74535310
|
| 112 |
+
[2026-01-08 16:49:15 root] (train_utils.py 187): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 113 |
+
[2026-01-08 16:49:16 root] (train_utils.py 108): INFO ========= Layer 3 =========
|
| 114 |
+
[2026-01-08 16:49:22 root] (train_utils.py 181): INFO layer 3 lwc lac iter 0, lr 0.00494542 time 4.469205s, mse: 3.37112665
|
| 115 |
+
[2026-01-08 16:49:26 root] (train_utils.py 181): INFO layer 3 lwc lac iter 1, lr 0.00478408 time 3.908183s, mse: 3.28124046
|
| 116 |
+
[2026-01-08 16:49:30 root] (train_utils.py 181): INFO layer 3 lwc lac iter 2, lr 0.00452302 time 3.881730s, mse: 3.07249451
|
| 117 |
+
[2026-01-08 16:49:34 root] (train_utils.py 181): INFO layer 3 lwc lac iter 3, lr 0.00417365 time 3.889869s, mse: 2.85796380
|
| 118 |
+
[2026-01-08 16:49:38 root] (train_utils.py 181): INFO layer 3 lwc lac iter 4, lr 0.00375125 time 3.935233s, mse: 2.88136601
|
| 119 |
+
[2026-01-08 16:49:42 root] (train_utils.py 181): INFO layer 3 lwc lac iter 5, lr 0.00327427 time 3.948762s, mse: 3.00021911
|
| 120 |
+
[2026-01-08 16:49:46 root] (train_utils.py 181): INFO layer 3 lwc lac iter 6, lr 0.00276356 time 3.965647s, mse: 2.90009570
|
| 121 |
+
[2026-01-08 16:49:50 root] (train_utils.py 181): INFO layer 3 lwc lac iter 7, lr 0.00224144 time 3.905676s, mse: 2.84187627
|
| 122 |
+
[2026-01-08 16:49:54 root] (train_utils.py 181): INFO layer 3 lwc lac iter 8, lr 0.00173073 time 3.916992s, mse: 3.88529181
|
| 123 |
+
[2026-01-08 16:49:58 root] (train_utils.py 181): INFO layer 3 lwc lac iter 9, lr 0.00125375 time 3.918374s, mse: 2.84166765
|
| 124 |
+
[2026-01-08 16:50:02 root] (train_utils.py 181): INFO layer 3 lwc lac iter 10, lr 0.00083135 time 3.926870s, mse: 2.83363008
|
| 125 |
+
[2026-01-08 16:50:05 root] (train_utils.py 181): INFO layer 3 lwc lac iter 11, lr 0.00048198 time 3.899929s, mse: 2.82961488
|
| 126 |
+
[2026-01-08 16:50:09 root] (train_utils.py 181): INFO layer 3 lwc lac iter 12, lr 0.00022092 time 3.909861s, mse: 2.82789564
|
| 127 |
+
[2026-01-08 16:50:13 root] (train_utils.py 181): INFO layer 3 lwc lac iter 13, lr 0.00005958 time 3.883598s, mse: 2.82704329
|
| 128 |
+
[2026-01-08 16:50:17 root] (train_utils.py 181): INFO layer 3 lwc lac iter 14, lr 0.00000500 time 3.906606s, mse: 2.82679415
|
| 129 |
+
[2026-01-08 16:50:18 root] (train_utils.py 187): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 130 |
+
[2026-01-08 16:50:18 root] (train_utils.py 108): INFO ========= Layer 4 =========
|
| 131 |
+
[2026-01-08 16:50:25 root] (train_utils.py 181): INFO layer 4 lwc lac iter 0, lr 0.00494542 time 4.617949s, mse: 5.87795258
|
| 132 |
+
[2026-01-08 16:50:29 root] (train_utils.py 181): INFO layer 4 lwc lac iter 1, lr 0.00478408 time 3.927140s, mse: 5.58140898
|
| 133 |
+
[2026-01-08 16:50:33 root] (train_utils.py 181): INFO layer 4 lwc lac iter 2, lr 0.00452302 time 3.943356s, mse: 5.40157461
|
| 134 |
+
[2026-01-08 16:50:37 root] (train_utils.py 181): INFO layer 4 lwc lac iter 3, lr 0.00417365 time 3.934676s, mse: 4.97706127
|
| 135 |
+
[2026-01-08 16:50:41 root] (train_utils.py 181): INFO layer 4 lwc lac iter 4, lr 0.00375125 time 3.936047s, mse: 4.83699369
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_165516.txt
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 16:55:16 root] (args_utils.py 159): INFO Arguments:
|
| 2 |
+
[2026-01-08 16:55:16 root] (args_utils.py 160): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': -1,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': -1,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 16:55:16 root] (args_utils.py 161): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 16:55:17 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-08 16:55:38 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-08 16:55:42 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 62 |
+
[2026-01-08 16:55:45 root] (train_utils.py 108): INFO ========= Layer 0 =========
|
| 63 |
+
[2026-01-08 16:55:53 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 5.095334s, mse: 0.02583671
|
| 64 |
+
[2026-01-08 16:55:57 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.934147s, mse: 0.01396359
|
| 65 |
+
[2026-01-08 16:56:01 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.939300s, mse: 0.01044113
|
| 66 |
+
[2026-01-08 16:56:05 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.908403s, mse: 0.00969208
|
| 67 |
+
[2026-01-08 16:56:09 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.915706s, mse: 0.00940374
|
| 68 |
+
[2026-01-08 16:56:13 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.912972s, mse: 0.00924401
|
| 69 |
+
[2026-01-08 16:56:17 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.921335s, mse: 0.00912234
|
| 70 |
+
[2026-01-08 16:56:21 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.932727s, mse: 0.00903957
|
| 71 |
+
[2026-01-08 16:56:25 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 4.433334s, mse: 0.00895381
|
| 72 |
+
[2026-01-08 16:56:29 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.907690s, mse: 0.00888840
|
| 73 |
+
[2026-01-08 16:56:33 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 3.904034s, mse: 0.00881676
|
| 74 |
+
[2026-01-08 16:56:37 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.927204s, mse: 0.00877623
|
| 75 |
+
[2026-01-08 16:56:41 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.925085s, mse: 0.00874014
|
| 76 |
+
[2026-01-08 16:56:45 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.916118s, mse: 0.00871035
|
| 77 |
+
[2026-01-08 16:56:48 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.921848s, mse: 0.00869927
|
| 78 |
+
[2026-01-08 16:56:49 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 79 |
+
[2026-01-08 16:56:49 root] (train_utils.py 108): INFO ========= Layer 1 =========
|
| 80 |
+
[2026-01-08 16:56:56 root] (train_utils.py 185): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 4.588893s, mse: 0.01656580
|
| 81 |
+
[2026-01-08 16:57:01 root] (train_utils.py 185): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 4.451948s, mse: 0.00616941
|
| 82 |
+
[2026-01-08 16:57:05 root] (train_utils.py 185): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.968735s, mse: 0.00441587
|
| 83 |
+
[2026-01-08 16:57:09 root] (train_utils.py 185): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 4.437059s, mse: 0.00399985
|
| 84 |
+
[2026-01-08 16:57:13 root] (train_utils.py 185): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 4.038846s, mse: 0.00383769
|
| 85 |
+
[2026-01-08 16:57:17 root] (train_utils.py 185): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.925880s, mse: 0.00373424
|
| 86 |
+
[2026-01-08 16:57:21 root] (train_utils.py 185): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.923243s, mse: 0.00368217
|
| 87 |
+
[2026-01-08 16:57:25 root] (train_utils.py 185): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 3.938122s, mse: 0.00363120
|
| 88 |
+
[2026-01-08 16:57:29 root] (train_utils.py 185): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 3.932140s, mse: 0.00357347
|
| 89 |
+
[2026-01-08 16:57:33 root] (train_utils.py 185): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 3.947205s, mse: 0.00352856
|
| 90 |
+
[2026-01-08 16:57:37 root] (train_utils.py 185): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 3.939108s, mse: 0.00350962
|
| 91 |
+
[2026-01-08 16:57:41 root] (train_utils.py 185): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 4.007798s, mse: 0.00346109
|
| 92 |
+
[2026-01-08 16:57:45 root] (train_utils.py 185): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 3.940766s, mse: 0.00342262
|
| 93 |
+
[2026-01-08 16:57:49 root] (train_utils.py 185): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.952263s, mse: 0.00341267
|
| 94 |
+
[2026-01-08 16:57:53 root] (train_utils.py 185): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 3.938083s, mse: 0.00340322
|
| 95 |
+
[2026-01-08 16:57:53 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 96 |
+
[2026-01-08 16:57:54 root] (train_utils.py 108): INFO ========= Layer 2 =========
|
| 97 |
+
[2026-01-08 16:58:00 root] (train_utils.py 185): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 4.587668s, mse: 0.03411270
|
| 98 |
+
[2026-01-08 16:58:04 root] (train_utils.py 185): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 3.952559s, mse: 0.00820368
|
| 99 |
+
[2026-01-08 16:58:08 root] (train_utils.py 185): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 3.936083s, mse: 0.00562381
|
| 100 |
+
[2026-01-08 16:58:12 root] (train_utils.py 185): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 3.948321s, mse: 0.00503509
|
| 101 |
+
[2026-01-08 16:58:16 root] (train_utils.py 185): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 3.930907s, mse: 0.00482035
|
| 102 |
+
[2026-01-08 16:58:20 root] (train_utils.py 185): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 3.918012s, mse: 0.00470118
|
| 103 |
+
[2026-01-08 16:58:24 root] (train_utils.py 185): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 3.927189s, mse: 0.00463578
|
| 104 |
+
[2026-01-08 16:58:28 root] (train_utils.py 185): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 3.935595s, mse: 0.00459153
|
| 105 |
+
[2026-01-08 16:58:32 root] (train_utils.py 185): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 3.921202s, mse: 0.00453731
|
| 106 |
+
[2026-01-08 16:58:36 root] (train_utils.py 185): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.939279s, mse: 0.00450525
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_165858.txt
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 16:58:58 root] (args_utils.py 159): INFO Arguments:
|
| 2 |
+
[2026-01-08 16:58:58 root] (args_utils.py 160): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': 128,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 16:58:58 root] (args_utils.py 161): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 16:58:59 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-08 16:59:21 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-08 16:59:26 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 62 |
+
[2026-01-08 16:59:29 root] (train_utils.py 108): INFO ========= Layer 0 =========
|
| 63 |
+
[2026-01-08 16:59:39 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 6.397410s, mse: 0.01574295
|
| 64 |
+
[2026-01-08 16:59:44 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 4.447838s, mse: 0.01115426
|
| 65 |
+
[2026-01-08 16:59:48 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 4.593246s, mse: 0.00938093
|
| 66 |
+
[2026-01-08 16:59:53 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 4.590999s, mse: 0.00881439
|
| 67 |
+
[2026-01-08 16:59:58 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 4.522616s, mse: 0.00857142
|
| 68 |
+
[2026-01-08 17:00:02 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 4.538843s, mse: 0.00849318
|
| 69 |
+
[2026-01-08 17:00:07 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 4.488467s, mse: 0.00832680
|
| 70 |
+
[2026-01-08 17:00:11 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 4.497394s, mse: 0.00828776
|
| 71 |
+
[2026-01-08 17:00:16 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 4.503586s, mse: 0.00818714
|
| 72 |
+
[2026-01-08 17:00:20 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 4.574569s, mse: 0.00813103
|
| 73 |
+
[2026-01-08 17:00:25 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 4.571058s, mse: 0.00808381
|
| 74 |
+
[2026-01-08 17:00:29 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 4.520602s, mse: 0.00804329
|
| 75 |
+
[2026-01-08 17:00:34 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 4.617018s, mse: 0.00799941
|
| 76 |
+
[2026-01-08 17:00:39 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 4.615621s, mse: 0.00795571
|
| 77 |
+
[2026-01-08 17:00:43 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 4.499076s, mse: 0.00794016
|
| 78 |
+
[2026-01-08 17:00:44 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 79 |
+
[2026-01-08 17:00:44 root] (train_utils.py 108): INFO ========= Layer 1 =========
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_184025.txt
ADDED
|
@@ -0,0 +1,675 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 18:40:25 root] (args_utils.py 159): INFO Arguments:
|
| 2 |
+
[2026-01-08 18:40:25 root] (args_utils.py 160): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': True,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': 128,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 18:40:25 root] (args_utils.py 161): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 18:40:28 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-08 18:40:44 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-08 18:40:53 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 62 |
+
[2026-01-08 18:40:57 root] (train_utils.py 108): INFO ========= Layer 0 =========
|
| 63 |
+
[2026-01-08 18:41:07 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 6.342341s, mse: 0.01574295
|
| 64 |
+
[2026-01-08 18:41:11 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.926945s, mse: 0.01115426
|
| 65 |
+
[2026-01-08 18:41:14 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.866743s, mse: 0.00938093
|
| 66 |
+
[2026-01-08 18:41:18 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.864751s, mse: 0.00881439
|
| 67 |
+
[2026-01-08 18:41:22 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.862903s, mse: 0.00857142
|
| 68 |
+
[2026-01-08 18:41:26 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.863009s, mse: 0.00849318
|
| 69 |
+
[2026-01-08 18:41:30 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.868398s, mse: 0.00832680
|
| 70 |
+
[2026-01-08 18:41:34 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.862813s, mse: 0.00828776
|
| 71 |
+
[2026-01-08 18:41:38 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 3.860975s, mse: 0.00818714
|
| 72 |
+
[2026-01-08 18:41:41 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.861234s, mse: 0.00813103
|
| 73 |
+
[2026-01-08 18:41:45 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 3.870577s, mse: 0.00808381
|
| 74 |
+
[2026-01-08 18:41:49 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.860093s, mse: 0.00804329
|
| 75 |
+
[2026-01-08 18:41:53 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.875212s, mse: 0.00799941
|
| 76 |
+
[2026-01-08 18:41:57 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.869323s, mse: 0.00795571
|
| 77 |
+
[2026-01-08 18:42:01 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.871119s, mse: 0.00794016
|
| 78 |
+
[2026-01-08 18:42:01 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 79 |
+
[2026-01-08 18:42:02 root] (train_utils.py 108): INFO ========= Layer 1 =========
|
| 80 |
+
[2026-01-08 18:42:11 root] (train_utils.py 185): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 5.915874s, mse: 0.00892038
|
| 81 |
+
[2026-01-08 18:42:15 root] (train_utils.py 185): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 3.927274s, mse: 0.00479663
|
| 82 |
+
[2026-01-08 18:42:18 root] (train_utils.py 185): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.867823s, mse: 0.00384854
|
| 83 |
+
[2026-01-08 18:42:22 root] (train_utils.py 185): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 3.867914s, mse: 0.00355465
|
| 84 |
+
[2026-01-08 18:42:26 root] (train_utils.py 185): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 3.868877s, mse: 0.00343135
|
| 85 |
+
[2026-01-08 18:42:30 root] (train_utils.py 185): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.867419s, mse: 0.00337971
|
| 86 |
+
[2026-01-08 18:42:34 root] (train_utils.py 185): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.868710s, mse: 0.00336636
|
| 87 |
+
[2026-01-08 18:42:38 root] (train_utils.py 185): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 3.870240s, mse: 0.00329515
|
| 88 |
+
[2026-01-08 18:42:42 root] (train_utils.py 185): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 3.870138s, mse: 0.00326379
|
| 89 |
+
[2026-01-08 18:42:45 root] (train_utils.py 185): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 3.872112s, mse: 0.00321724
|
| 90 |
+
[2026-01-08 18:42:49 root] (train_utils.py 185): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 3.867391s, mse: 0.00316591
|
| 91 |
+
[2026-01-08 18:42:53 root] (train_utils.py 185): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 3.864894s, mse: 0.00313276
|
| 92 |
+
[2026-01-08 18:42:57 root] (train_utils.py 185): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 3.868528s, mse: 0.00310469
|
| 93 |
+
[2026-01-08 18:43:01 root] (train_utils.py 185): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.865803s, mse: 0.00308243
|
| 94 |
+
[2026-01-08 18:43:05 root] (train_utils.py 185): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 3.869683s, mse: 0.00306749
|
| 95 |
+
[2026-01-08 18:43:05 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 96 |
+
[2026-01-08 18:43:06 root] (train_utils.py 108): INFO ========= Layer 2 =========
|
| 97 |
+
[2026-01-08 18:43:14 root] (train_utils.py 185): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 5.276760s, mse: 0.01750460
|
| 98 |
+
[2026-01-08 18:43:18 root] (train_utils.py 185): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 3.925459s, mse: 0.00626545
|
| 99 |
+
[2026-01-08 18:43:22 root] (train_utils.py 185): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 3.866118s, mse: 0.00494380
|
| 100 |
+
[2026-01-08 18:43:26 root] (train_utils.py 185): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 3.866142s, mse: 0.00453308
|
| 101 |
+
[2026-01-08 18:43:29 root] (train_utils.py 185): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 3.870597s, mse: 0.00439964
|
| 102 |
+
[2026-01-08 18:43:33 root] (train_utils.py 185): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 3.871755s, mse: 0.00429795
|
| 103 |
+
[2026-01-08 18:43:37 root] (train_utils.py 185): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 3.868093s, mse: 0.00425246
|
| 104 |
+
[2026-01-08 18:43:41 root] (train_utils.py 185): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 3.871027s, mse: 0.00420888
|
| 105 |
+
[2026-01-08 18:43:45 root] (train_utils.py 185): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 3.869353s, mse: 0.00415287
|
| 106 |
+
[2026-01-08 18:43:49 root] (train_utils.py 185): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.868892s, mse: 0.00411024
|
| 107 |
+
[2026-01-08 18:43:53 root] (train_utils.py 185): INFO layer 2 lwc lac iter 10, lr 0.00083135 time 3.875470s, mse: 0.00407672
|
| 108 |
+
[2026-01-08 18:43:57 root] (train_utils.py 185): INFO layer 2 lwc lac iter 11, lr 0.00048198 time 3.868368s, mse: 0.00404750
|
| 109 |
+
[2026-01-08 18:44:00 root] (train_utils.py 185): INFO layer 2 lwc lac iter 12, lr 0.00022092 time 3.870399s, mse: 0.00401742
|
| 110 |
+
[2026-01-08 18:44:04 root] (train_utils.py 185): INFO layer 2 lwc lac iter 13, lr 0.00005958 time 3.868037s, mse: 0.00398090
|
| 111 |
+
[2026-01-08 18:44:08 root] (train_utils.py 185): INFO layer 2 lwc lac iter 14, lr 0.00000500 time 3.879722s, mse: 0.00397130
|
| 112 |
+
[2026-01-08 18:44:09 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 113 |
+
[2026-01-08 18:44:09 root] (train_utils.py 108): INFO ========= Layer 3 =========
|
| 114 |
+
[2026-01-08 18:44:17 root] (train_utils.py 185): INFO layer 3 lwc lac iter 0, lr 0.00494542 time 5.277266s, mse: 0.02308414
|
| 115 |
+
[2026-01-08 18:44:21 root] (train_utils.py 185): INFO layer 3 lwc lac iter 1, lr 0.00478408 time 3.869237s, mse: 0.01333557
|
| 116 |
+
[2026-01-08 18:44:25 root] (train_utils.py 185): INFO layer 3 lwc lac iter 2, lr 0.00452302 time 3.873277s, mse: 0.01099337
|
| 117 |
+
[2026-01-08 18:44:29 root] (train_utils.py 185): INFO layer 3 lwc lac iter 3, lr 0.00417365 time 3.871180s, mse: 0.01028412
|
| 118 |
+
[2026-01-08 18:44:33 root] (train_utils.py 185): INFO layer 3 lwc lac iter 4, lr 0.00375125 time 3.871590s, mse: 0.01000082
|
| 119 |
+
[2026-01-08 18:44:36 root] (train_utils.py 185): INFO layer 3 lwc lac iter 5, lr 0.00327427 time 3.869860s, mse: 0.00980410
|
| 120 |
+
[2026-01-08 18:44:40 root] (train_utils.py 185): INFO layer 3 lwc lac iter 6, lr 0.00276356 time 3.869903s, mse: 0.00969286
|
| 121 |
+
[2026-01-08 18:44:44 root] (train_utils.py 185): INFO layer 3 lwc lac iter 7, lr 0.00224144 time 3.872837s, mse: 0.00956387
|
| 122 |
+
[2026-01-08 18:44:48 root] (train_utils.py 185): INFO layer 3 lwc lac iter 8, lr 0.00173073 time 3.871547s, mse: 0.00946260
|
| 123 |
+
[2026-01-08 18:44:52 root] (train_utils.py 185): INFO layer 3 lwc lac iter 9, lr 0.00125375 time 3.866354s, mse: 0.00937346
|
| 124 |
+
[2026-01-08 18:44:56 root] (train_utils.py 185): INFO layer 3 lwc lac iter 10, lr 0.00083135 time 3.870575s, mse: 0.00926330
|
| 125 |
+
[2026-01-08 18:45:00 root] (train_utils.py 185): INFO layer 3 lwc lac iter 11, lr 0.00048198 time 3.865113s, mse: 0.00916464
|
| 126 |
+
[2026-01-08 18:45:04 root] (train_utils.py 185): INFO layer 3 lwc lac iter 12, lr 0.00022092 time 3.873380s, mse: 0.00907166
|
| 127 |
+
[2026-01-08 18:45:07 root] (train_utils.py 185): INFO layer 3 lwc lac iter 13, lr 0.00005958 time 3.872867s, mse: 0.00904066
|
| 128 |
+
[2026-01-08 18:45:11 root] (train_utils.py 185): INFO layer 3 lwc lac iter 14, lr 0.00000500 time 3.859612s, mse: 0.00900416
|
| 129 |
+
[2026-01-08 18:45:12 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 130 |
+
[2026-01-08 18:45:13 root] (train_utils.py 108): INFO ========= Layer 4 =========
|
| 131 |
+
[2026-01-08 18:45:20 root] (train_utils.py 185): INFO layer 4 lwc lac iter 0, lr 0.00494542 time 5.013705s, mse: 0.06576648
|
| 132 |
+
[2026-01-08 18:45:24 root] (train_utils.py 185): INFO layer 4 lwc lac iter 1, lr 0.00478408 time 3.866042s, mse: 0.03741666
|
| 133 |
+
[2026-01-08 18:45:28 root] (train_utils.py 185): INFO layer 4 lwc lac iter 2, lr 0.00452302 time 3.867324s, mse: 0.03053248
|
| 134 |
+
[2026-01-08 18:45:32 root] (train_utils.py 185): INFO layer 4 lwc lac iter 3, lr 0.00417365 time 3.867816s, mse: 0.02855516
|
| 135 |
+
[2026-01-08 18:45:35 root] (train_utils.py 185): INFO layer 4 lwc lac iter 4, lr 0.00375125 time 3.874113s, mse: 0.02790034
|
| 136 |
+
[2026-01-08 18:45:39 root] (train_utils.py 185): INFO layer 4 lwc lac iter 5, lr 0.00327427 time 3.864290s, mse: 0.02746365
|
| 137 |
+
[2026-01-08 18:45:43 root] (train_utils.py 185): INFO layer 4 lwc lac iter 6, lr 0.00276356 time 3.874358s, mse: 0.02716962
|
| 138 |
+
[2026-01-08 18:45:47 root] (train_utils.py 185): INFO layer 4 lwc lac iter 7, lr 0.00224144 time 3.866786s, mse: 0.02687641
|
| 139 |
+
[2026-01-08 18:45:51 root] (train_utils.py 185): INFO layer 4 lwc lac iter 8, lr 0.00173073 time 3.866034s, mse: 0.02662238
|
| 140 |
+
[2026-01-08 18:45:55 root] (train_utils.py 185): INFO layer 4 lwc lac iter 9, lr 0.00125375 time 3.867874s, mse: 0.02643147
|
| 141 |
+
[2026-01-08 18:45:59 root] (train_utils.py 185): INFO layer 4 lwc lac iter 10, lr 0.00083135 time 3.875141s, mse: 0.02624781
|
| 142 |
+
[2026-01-08 18:46:03 root] (train_utils.py 185): INFO layer 4 lwc lac iter 11, lr 0.00048198 time 3.867632s, mse: 0.02604026
|
| 143 |
+
[2026-01-08 18:46:06 root] (train_utils.py 185): INFO layer 4 lwc lac iter 12, lr 0.00022092 time 3.868149s, mse: 0.02585863
|
| 144 |
+
[2026-01-08 18:46:10 root] (train_utils.py 185): INFO layer 4 lwc lac iter 13, lr 0.00005958 time 3.867380s, mse: 0.02578292
|
| 145 |
+
[2026-01-08 18:46:14 root] (train_utils.py 185): INFO layer 4 lwc lac iter 14, lr 0.00000500 time 3.871570s, mse: 0.02572995
|
| 146 |
+
[2026-01-08 18:46:15 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 147 |
+
[2026-01-08 18:46:15 root] (train_utils.py 108): INFO ========= Layer 5 =========
|
| 148 |
+
[2026-01-08 18:46:23 root] (train_utils.py 185): INFO layer 5 lwc lac iter 0, lr 0.00494542 time 5.259219s, mse: 0.13743916
|
| 149 |
+
[2026-01-08 18:46:27 root] (train_utils.py 185): INFO layer 5 lwc lac iter 1, lr 0.00478408 time 3.869206s, mse: 0.08057592
|
| 150 |
+
[2026-01-08 18:46:31 root] (train_utils.py 185): INFO layer 5 lwc lac iter 2, lr 0.00452302 time 3.877397s, mse: 0.06617787
|
| 151 |
+
[2026-01-08 18:46:35 root] (train_utils.py 185): INFO layer 5 lwc lac iter 3, lr 0.00417365 time 3.875794s, mse: 0.06287611
|
| 152 |
+
[2026-01-08 18:46:38 root] (train_utils.py 185): INFO layer 5 lwc lac iter 4, lr 0.00375125 time 3.874708s, mse: 0.06213523
|
| 153 |
+
[2026-01-08 18:46:42 root] (train_utils.py 185): INFO layer 5 lwc lac iter 5, lr 0.00327427 time 3.873983s, mse: 0.06160403
|
| 154 |
+
[2026-01-08 18:46:46 root] (train_utils.py 185): INFO layer 5 lwc lac iter 6, lr 0.00276356 time 3.868648s, mse: 0.06119698
|
| 155 |
+
[2026-01-08 18:46:50 root] (train_utils.py 185): INFO layer 5 lwc lac iter 7, lr 0.00224144 time 3.871142s, mse: 0.06094177
|
| 156 |
+
[2026-01-08 18:46:54 root] (train_utils.py 185): INFO layer 5 lwc lac iter 8, lr 0.00173073 time 3.869253s, mse: 0.06060794
|
| 157 |
+
[2026-01-08 18:46:58 root] (train_utils.py 185): INFO layer 5 lwc lac iter 9, lr 0.00125375 time 3.880099s, mse: 0.06020888
|
| 158 |
+
[2026-01-08 18:47:02 root] (train_utils.py 185): INFO layer 5 lwc lac iter 10, lr 0.00083135 time 3.872482s, mse: 0.05995716
|
| 159 |
+
[2026-01-08 18:47:06 root] (train_utils.py 185): INFO layer 5 lwc lac iter 11, lr 0.00048198 time 3.875441s, mse: 0.05978661
|
| 160 |
+
[2026-01-08 18:47:09 root] (train_utils.py 185): INFO layer 5 lwc lac iter 12, lr 0.00022092 time 3.871503s, mse: 0.05955682
|
| 161 |
+
[2026-01-08 18:47:13 root] (train_utils.py 185): INFO layer 5 lwc lac iter 13, lr 0.00005958 time 3.870681s, mse: 0.05938030
|
| 162 |
+
[2026-01-08 18:47:17 root] (train_utils.py 185): INFO layer 5 lwc lac iter 14, lr 0.00000500 time 3.873357s, mse: 0.05934311
|
| 163 |
+
[2026-01-08 18:47:18 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 164 |
+
[2026-01-08 18:47:18 root] (train_utils.py 108): INFO ========= Layer 6 =========
|
| 165 |
+
[2026-01-08 18:47:26 root] (train_utils.py 185): INFO layer 6 lwc lac iter 0, lr 0.00494542 time 5.163689s, mse: 1.86451793
|
| 166 |
+
[2026-01-08 18:47:30 root] (train_utils.py 185): INFO layer 6 lwc lac iter 1, lr 0.00478408 time 3.873172s, mse: 0.35658583
|
| 167 |
+
[2026-01-08 18:47:34 root] (train_utils.py 185): INFO layer 6 lwc lac iter 2, lr 0.00452302 time 3.877536s, mse: 0.32737118
|
| 168 |
+
[2026-01-08 18:47:38 root] (train_utils.py 185): INFO layer 6 lwc lac iter 3, lr 0.00417365 time 3.867511s, mse: 0.28929594
|
| 169 |
+
[2026-01-08 18:47:41 root] (train_utils.py 185): INFO layer 6 lwc lac iter 4, lr 0.00375125 time 3.878428s, mse: 0.24128482
|
| 170 |
+
[2026-01-08 18:47:45 root] (train_utils.py 185): INFO layer 6 lwc lac iter 5, lr 0.00327427 time 3.864721s, mse: 0.21027605
|
| 171 |
+
[2026-01-08 18:47:49 root] (train_utils.py 185): INFO layer 6 lwc lac iter 6, lr 0.00276356 time 3.872345s, mse: 0.25483868
|
| 172 |
+
[2026-01-08 18:47:53 root] (train_utils.py 185): INFO layer 6 lwc lac iter 7, lr 0.00224144 time 3.869268s, mse: 0.23871142
|
| 173 |
+
[2026-01-08 18:47:57 root] (train_utils.py 185): INFO layer 6 lwc lac iter 8, lr 0.00173073 time 3.874523s, mse: 0.21885920
|
| 174 |
+
[2026-01-08 18:48:01 root] (train_utils.py 185): INFO layer 6 lwc lac iter 9, lr 0.00125375 time 3.866698s, mse: 0.20672695
|
| 175 |
+
[2026-01-08 18:48:05 root] (train_utils.py 185): INFO layer 6 lwc lac iter 10, lr 0.00083135 time 3.875635s, mse: 0.20202750
|
| 176 |
+
[2026-01-08 18:48:09 root] (train_utils.py 185): INFO layer 6 lwc lac iter 11, lr 0.00048198 time 3.868720s, mse: 0.17932597
|
| 177 |
+
[2026-01-08 18:48:12 root] (train_utils.py 185): INFO layer 6 lwc lac iter 12, lr 0.00022092 time 3.877253s, mse: 0.20257902
|
| 178 |
+
[2026-01-08 18:48:16 root] (train_utils.py 185): INFO layer 6 lwc lac iter 13, lr 0.00005958 time 3.873001s, mse: 0.20667967
|
| 179 |
+
[2026-01-08 18:48:20 root] (train_utils.py 185): INFO layer 6 lwc lac iter 14, lr 0.00000500 time 3.868689s, mse: 0.16777667
|
| 180 |
+
[2026-01-08 18:48:21 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 181 |
+
[2026-01-08 18:48:21 root] (train_utils.py 108): INFO ========= Layer 7 =========
|
| 182 |
+
[2026-01-08 18:48:29 root] (train_utils.py 185): INFO layer 7 lwc lac iter 0, lr 0.00494542 time 5.284689s, mse: 0.23462753
|
| 183 |
+
[2026-01-08 18:48:33 root] (train_utils.py 185): INFO layer 7 lwc lac iter 1, lr 0.00478408 time 3.870807s, mse: 0.14976017
|
| 184 |
+
[2026-01-08 18:48:37 root] (train_utils.py 185): INFO layer 7 lwc lac iter 2, lr 0.00452302 time 3.875834s, mse: 0.12312289
|
| 185 |
+
[2026-01-08 18:48:41 root] (train_utils.py 185): INFO layer 7 lwc lac iter 3, lr 0.00417365 time 3.870820s, mse: 0.11779824
|
| 186 |
+
[2026-01-08 18:48:44 root] (train_utils.py 185): INFO layer 7 lwc lac iter 4, lr 0.00375125 time 3.873519s, mse: 0.11621600
|
| 187 |
+
[2026-01-08 18:48:48 root] (train_utils.py 185): INFO layer 7 lwc lac iter 5, lr 0.00327427 time 3.879462s, mse: 0.11538153
|
| 188 |
+
[2026-01-08 18:48:52 root] (train_utils.py 185): INFO layer 7 lwc lac iter 6, lr 0.00276356 time 3.869727s, mse: 0.11461711
|
| 189 |
+
[2026-01-08 18:48:56 root] (train_utils.py 185): INFO layer 7 lwc lac iter 7, lr 0.00224144 time 3.870542s, mse: 0.11396322
|
| 190 |
+
[2026-01-08 18:49:00 root] (train_utils.py 185): INFO layer 7 lwc lac iter 8, lr 0.00173073 time 3.872142s, mse: 0.11346199
|
| 191 |
+
[2026-01-08 18:49:04 root] (train_utils.py 185): INFO layer 7 lwc lac iter 9, lr 0.00125375 time 3.875134s, mse: 0.11303829
|
| 192 |
+
[2026-01-08 18:49:08 root] (train_utils.py 185): INFO layer 7 lwc lac iter 10, lr 0.00083135 time 3.868972s, mse: 0.11244514
|
| 193 |
+
[2026-01-08 18:49:12 root] (train_utils.py 185): INFO layer 7 lwc lac iter 11, lr 0.00048198 time 3.871574s, mse: 0.11193727
|
| 194 |
+
[2026-01-08 18:49:15 root] (train_utils.py 185): INFO layer 7 lwc lac iter 12, lr 0.00022092 time 3.900703s, mse: 0.11167257
|
| 195 |
+
[2026-01-08 18:49:19 root] (train_utils.py 185): INFO layer 7 lwc lac iter 13, lr 0.00005958 time 3.875596s, mse: 0.11139309
|
| 196 |
+
[2026-01-08 18:49:23 root] (train_utils.py 185): INFO layer 7 lwc lac iter 14, lr 0.00000500 time 3.875573s, mse: 0.11127126
|
| 197 |
+
[2026-01-08 18:49:24 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 198 |
+
[2026-01-08 18:49:24 root] (train_utils.py 108): INFO ========= Layer 8 =========
|
| 199 |
+
[2026-01-08 18:49:32 root] (train_utils.py 185): INFO layer 8 lwc lac iter 0, lr 0.00494542 time 5.331074s, mse: 0.31783378
|
| 200 |
+
[2026-01-08 18:49:36 root] (train_utils.py 185): INFO layer 8 lwc lac iter 1, lr 0.00478408 time 4.026908s, mse: 0.21154313
|
| 201 |
+
[2026-01-08 18:49:40 root] (train_utils.py 185): INFO layer 8 lwc lac iter 2, lr 0.00452302 time 3.870493s, mse: 0.17556834
|
| 202 |
+
[2026-01-08 18:49:44 root] (train_utils.py 185): INFO layer 8 lwc lac iter 3, lr 0.00417365 time 3.876034s, mse: 0.16892871
|
| 203 |
+
[2026-01-08 18:49:48 root] (train_utils.py 185): INFO layer 8 lwc lac iter 4, lr 0.00375125 time 3.871941s, mse: 0.16700211
|
| 204 |
+
[2026-01-08 18:49:51 root] (train_utils.py 185): INFO layer 8 lwc lac iter 5, lr 0.00327427 time 3.867337s, mse: 0.16594610
|
| 205 |
+
[2026-01-08 18:49:55 root] (train_utils.py 185): INFO layer 8 lwc lac iter 6, lr 0.00276356 time 3.865289s, mse: 0.16510613
|
| 206 |
+
[2026-01-08 18:49:59 root] (train_utils.py 185): INFO layer 8 lwc lac iter 7, lr 0.00224144 time 3.873836s, mse: 0.16456470
|
| 207 |
+
[2026-01-08 18:50:03 root] (train_utils.py 185): INFO layer 8 lwc lac iter 8, lr 0.00173073 time 3.868187s, mse: 0.16401851
|
| 208 |
+
[2026-01-08 18:50:07 root] (train_utils.py 185): INFO layer 8 lwc lac iter 9, lr 0.00125375 time 3.874494s, mse: 0.16352586
|
| 209 |
+
[2026-01-08 18:50:11 root] (train_utils.py 185): INFO layer 8 lwc lac iter 10, lr 0.00083135 time 3.868160s, mse: 0.16331530
|
| 210 |
+
[2026-01-08 18:50:15 root] (train_utils.py 185): INFO layer 8 lwc lac iter 11, lr 0.00048198 time 3.871973s, mse: 0.16285881
|
| 211 |
+
[2026-01-08 18:50:19 root] (train_utils.py 185): INFO layer 8 lwc lac iter 12, lr 0.00022092 time 3.868864s, mse: 0.16254890
|
| 212 |
+
[2026-01-08 18:50:22 root] (train_utils.py 185): INFO layer 8 lwc lac iter 13, lr 0.00005958 time 3.867946s, mse: 0.16240378
|
| 213 |
+
[2026-01-08 18:50:26 root] (train_utils.py 185): INFO layer 8 lwc lac iter 14, lr 0.00000500 time 3.872199s, mse: 0.16246043
|
| 214 |
+
[2026-01-08 18:50:27 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 215 |
+
[2026-01-08 18:50:28 root] (train_utils.py 108): INFO ========= Layer 9 =========
|
| 216 |
+
[2026-01-08 18:50:35 root] (train_utils.py 185): INFO layer 9 lwc lac iter 0, lr 0.00494542 time 5.142299s, mse: 0.37875688
|
| 217 |
+
[2026-01-08 18:50:39 root] (train_utils.py 185): INFO layer 9 lwc lac iter 1, lr 0.00478408 time 3.869491s, mse: 0.25363240
|
| 218 |
+
[2026-01-08 18:50:43 root] (train_utils.py 185): INFO layer 9 lwc lac iter 2, lr 0.00452302 time 3.869034s, mse: 0.21064380
|
| 219 |
+
[2026-01-08 18:50:47 root] (train_utils.py 185): INFO layer 9 lwc lac iter 3, lr 0.00417365 time 3.872067s, mse: 0.20179385
|
| 220 |
+
[2026-01-08 18:50:51 root] (train_utils.py 185): INFO layer 9 lwc lac iter 4, lr 0.00375125 time 3.880042s, mse: 0.19936548
|
| 221 |
+
[2026-01-08 18:50:55 root] (train_utils.py 185): INFO layer 9 lwc lac iter 5, lr 0.00327427 time 3.866734s, mse: 0.19817175
|
| 222 |
+
[2026-01-08 18:50:59 root] (train_utils.py 185): INFO layer 9 lwc lac iter 6, lr 0.00276356 time 3.873047s, mse: 0.19703594
|
| 223 |
+
[2026-01-08 18:51:02 root] (train_utils.py 185): INFO layer 9 lwc lac iter 7, lr 0.00224144 time 3.874506s, mse: 0.19626960
|
| 224 |
+
[2026-01-08 18:51:06 root] (train_utils.py 185): INFO layer 9 lwc lac iter 8, lr 0.00173073 time 3.874818s, mse: 0.19534998
|
| 225 |
+
[2026-01-08 18:51:10 root] (train_utils.py 185): INFO layer 9 lwc lac iter 9, lr 0.00125375 time 3.871551s, mse: 0.19473058
|
| 226 |
+
[2026-01-08 18:51:14 root] (train_utils.py 185): INFO layer 9 lwc lac iter 10, lr 0.00083135 time 3.871144s, mse: 0.19404019
|
| 227 |
+
[2026-01-08 18:51:18 root] (train_utils.py 185): INFO layer 9 lwc lac iter 11, lr 0.00048198 time 3.871020s, mse: 0.19356999
|
| 228 |
+
[2026-01-08 18:51:22 root] (train_utils.py 185): INFO layer 9 lwc lac iter 12, lr 0.00022092 time 3.869848s, mse: 0.19326007
|
| 229 |
+
[2026-01-08 18:51:26 root] (train_utils.py 185): INFO layer 9 lwc lac iter 13, lr 0.00005958 time 3.865149s, mse: 0.19282311
|
| 230 |
+
[2026-01-08 18:51:30 root] (train_utils.py 185): INFO layer 9 lwc lac iter 14, lr 0.00000500 time 3.872090s, mse: 0.19267595
|
| 231 |
+
[2026-01-08 18:51:30 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 232 |
+
[2026-01-08 18:51:31 root] (train_utils.py 108): INFO ========= Layer 10 =========
|
| 233 |
+
[2026-01-08 18:51:38 root] (train_utils.py 185): INFO layer 10 lwc lac iter 0, lr 0.00494542 time 5.212747s, mse: 0.44592521
|
| 234 |
+
[2026-01-08 18:51:42 root] (train_utils.py 185): INFO layer 10 lwc lac iter 1, lr 0.00478408 time 3.870292s, mse: 0.28058022
|
| 235 |
+
[2026-01-08 18:51:46 root] (train_utils.py 185): INFO layer 10 lwc lac iter 2, lr 0.00452302 time 3.876544s, mse: 0.22870731
|
| 236 |
+
[2026-01-08 18:51:50 root] (train_utils.py 185): INFO layer 10 lwc lac iter 3, lr 0.00417365 time 3.870140s, mse: 0.21672769
|
| 237 |
+
[2026-01-08 18:51:54 root] (train_utils.py 185): INFO layer 10 lwc lac iter 4, lr 0.00375125 time 3.867480s, mse: 0.21354958
|
| 238 |
+
[2026-01-08 18:51:58 root] (train_utils.py 185): INFO layer 10 lwc lac iter 5, lr 0.00327427 time 3.874588s, mse: 0.21149486
|
| 239 |
+
[2026-01-08 18:52:02 root] (train_utils.py 185): INFO layer 10 lwc lac iter 6, lr 0.00276356 time 3.870549s, mse: 0.21045262
|
| 240 |
+
[2026-01-08 18:52:05 root] (train_utils.py 185): INFO layer 10 lwc lac iter 7, lr 0.00224144 time 3.872663s, mse: 0.20926467
|
| 241 |
+
[2026-01-08 18:52:09 root] (train_utils.py 185): INFO layer 10 lwc lac iter 8, lr 0.00173073 time 3.873691s, mse: 0.20823501
|
| 242 |
+
[2026-01-08 18:52:13 root] (train_utils.py 185): INFO layer 10 lwc lac iter 9, lr 0.00125375 time 3.869580s, mse: 0.20746952
|
| 243 |
+
[2026-01-08 18:52:17 root] (train_utils.py 185): INFO layer 10 lwc lac iter 10, lr 0.00083135 time 3.872574s, mse: 0.20690618
|
| 244 |
+
[2026-01-08 18:52:21 root] (train_utils.py 185): INFO layer 10 lwc lac iter 11, lr 0.00048198 time 3.870390s, mse: 0.20613439
|
| 245 |
+
[2026-01-08 18:52:25 root] (train_utils.py 185): INFO layer 10 lwc lac iter 12, lr 0.00022092 time 3.867468s, mse: 0.20562243
|
| 246 |
+
[2026-01-08 18:52:29 root] (train_utils.py 185): INFO layer 10 lwc lac iter 13, lr 0.00005958 time 3.874285s, mse: 0.20517452
|
| 247 |
+
[2026-01-08 18:52:33 root] (train_utils.py 185): INFO layer 10 lwc lac iter 14, lr 0.00000500 time 3.867692s, mse: 0.20504668
|
| 248 |
+
[2026-01-08 18:52:33 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 249 |
+
[2026-01-08 18:52:34 root] (train_utils.py 108): INFO ========= Layer 11 =========
|
| 250 |
+
[2026-01-08 18:52:42 root] (train_utils.py 185): INFO layer 11 lwc lac iter 0, lr 0.00494542 time 5.363442s, mse: 0.39262417
|
| 251 |
+
[2026-01-08 18:52:46 root] (train_utils.py 185): INFO layer 11 lwc lac iter 1, lr 0.00478408 time 3.927743s, mse: 0.27127978
|
| 252 |
+
[2026-01-08 18:52:49 root] (train_utils.py 185): INFO layer 11 lwc lac iter 2, lr 0.00452302 time 3.871058s, mse: 0.22630122
|
| 253 |
+
[2026-01-08 18:52:53 root] (train_utils.py 185): INFO layer 11 lwc lac iter 3, lr 0.00417365 time 3.867345s, mse: 0.21789221
|
| 254 |
+
[2026-01-08 18:52:57 root] (train_utils.py 185): INFO layer 11 lwc lac iter 4, lr 0.00375125 time 3.870425s, mse: 0.21573043
|
| 255 |
+
[2026-01-08 18:53:01 root] (train_utils.py 185): INFO layer 11 lwc lac iter 5, lr 0.00327427 time 3.883979s, mse: 0.21401882
|
| 256 |
+
[2026-01-08 18:53:05 root] (train_utils.py 185): INFO layer 11 lwc lac iter 6, lr 0.00276356 time 3.875270s, mse: 0.21313243
|
| 257 |
+
[2026-01-08 18:53:09 root] (train_utils.py 185): INFO layer 11 lwc lac iter 7, lr 0.00224144 time 3.878757s, mse: 0.21215978
|
| 258 |
+
[2026-01-08 18:53:13 root] (train_utils.py 185): INFO layer 11 lwc lac iter 8, lr 0.00173073 time 3.868496s, mse: 0.21121168
|
| 259 |
+
[2026-01-08 18:53:17 root] (train_utils.py 185): INFO layer 11 lwc lac iter 9, lr 0.00125375 time 3.869603s, mse: 0.21032479
|
| 260 |
+
[2026-01-08 18:53:20 root] (train_utils.py 185): INFO layer 11 lwc lac iter 10, lr 0.00083135 time 3.874183s, mse: 0.20987187
|
| 261 |
+
[2026-01-08 18:53:24 root] (train_utils.py 185): INFO layer 11 lwc lac iter 11, lr 0.00048198 time 3.864224s, mse: 0.20908046
|
| 262 |
+
[2026-01-08 18:53:28 root] (train_utils.py 185): INFO layer 11 lwc lac iter 12, lr 0.00022092 time 3.869574s, mse: 0.20848191
|
| 263 |
+
[2026-01-08 18:53:32 root] (train_utils.py 185): INFO layer 11 lwc lac iter 13, lr 0.00005958 time 3.873634s, mse: 0.20800886
|
| 264 |
+
[2026-01-08 18:53:36 root] (train_utils.py 185): INFO layer 11 lwc lac iter 14, lr 0.00000500 time 3.872741s, mse: 0.20795538
|
| 265 |
+
[2026-01-08 18:53:36 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 266 |
+
[2026-01-08 18:53:37 root] (train_utils.py 108): INFO ========= Layer 12 =========
|
| 267 |
+
[2026-01-08 18:53:45 root] (train_utils.py 185): INFO layer 12 lwc lac iter 0, lr 0.00494542 time 5.307302s, mse: 0.43535280
|
| 268 |
+
[2026-01-08 18:53:49 root] (train_utils.py 185): INFO layer 12 lwc lac iter 1, lr 0.00478408 time 3.867751s, mse: 0.29579335
|
| 269 |
+
[2026-01-08 18:53:53 root] (train_utils.py 185): INFO layer 12 lwc lac iter 2, lr 0.00452302 time 3.869383s, mse: 0.24488190
|
| 270 |
+
[2026-01-08 18:53:57 root] (train_utils.py 185): INFO layer 12 lwc lac iter 3, lr 0.00417365 time 3.869865s, mse: 0.23438135
|
| 271 |
+
[2026-01-08 18:54:01 root] (train_utils.py 185): INFO layer 12 lwc lac iter 4, lr 0.00375125 time 3.875870s, mse: 0.23133603
|
| 272 |
+
[2026-01-08 18:54:04 root] (train_utils.py 185): INFO layer 12 lwc lac iter 5, lr 0.00327427 time 3.872000s, mse: 0.22933656
|
| 273 |
+
[2026-01-08 18:54:08 root] (train_utils.py 185): INFO layer 12 lwc lac iter 6, lr 0.00276356 time 3.880461s, mse: 0.22804067
|
| 274 |
+
[2026-01-08 18:54:12 root] (train_utils.py 185): INFO layer 12 lwc lac iter 7, lr 0.00224144 time 3.874234s, mse: 0.22690852
|
| 275 |
+
[2026-01-08 18:54:16 root] (train_utils.py 185): INFO layer 12 lwc lac iter 8, lr 0.00173073 time 3.875051s, mse: 0.22579126
|
| 276 |
+
[2026-01-08 18:54:20 root] (train_utils.py 185): INFO layer 12 lwc lac iter 9, lr 0.00125375 time 3.877198s, mse: 0.22475064
|
| 277 |
+
[2026-01-08 18:54:24 root] (train_utils.py 185): INFO layer 12 lwc lac iter 10, lr 0.00083135 time 3.872357s, mse: 0.22366890
|
| 278 |
+
[2026-01-08 18:54:28 root] (train_utils.py 185): INFO layer 12 lwc lac iter 11, lr 0.00048198 time 3.870901s, mse: 0.22277188
|
| 279 |
+
[2026-01-08 18:54:32 root] (train_utils.py 185): INFO layer 12 lwc lac iter 12, lr 0.00022092 time 3.872854s, mse: 0.22196589
|
| 280 |
+
[2026-01-08 18:54:35 root] (train_utils.py 185): INFO layer 12 lwc lac iter 13, lr 0.00005958 time 3.894168s, mse: 0.22144113
|
| 281 |
+
[2026-01-08 18:54:39 root] (train_utils.py 185): INFO layer 12 lwc lac iter 14, lr 0.00000500 time 3.869174s, mse: 0.22116731
|
| 282 |
+
[2026-01-08 18:54:40 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 283 |
+
[2026-01-08 18:54:41 root] (train_utils.py 108): INFO ========= Layer 13 =========
|
| 284 |
+
[2026-01-08 18:54:48 root] (train_utils.py 185): INFO layer 13 lwc lac iter 0, lr 0.00494542 time 5.302742s, mse: 0.44991863
|
| 285 |
+
[2026-01-08 18:54:52 root] (train_utils.py 185): INFO layer 13 lwc lac iter 1, lr 0.00478408 time 3.929768s, mse: 0.30773303
|
| 286 |
+
[2026-01-08 18:54:56 root] (train_utils.py 185): INFO layer 13 lwc lac iter 2, lr 0.00452302 time 3.868324s, mse: 0.25602528
|
| 287 |
+
[2026-01-08 18:55:00 root] (train_utils.py 185): INFO layer 13 lwc lac iter 3, lr 0.00417365 time 3.871515s, mse: 0.24593170
|
| 288 |
+
[2026-01-08 18:55:04 root] (train_utils.py 185): INFO layer 13 lwc lac iter 4, lr 0.00375125 time 3.868756s, mse: 0.24332635
|
| 289 |
+
[2026-01-08 18:55:08 root] (train_utils.py 185): INFO layer 13 lwc lac iter 5, lr 0.00327427 time 3.875814s, mse: 0.24169515
|
| 290 |
+
[2026-01-08 18:55:12 root] (train_utils.py 185): INFO layer 13 lwc lac iter 6, lr 0.00276356 time 3.877859s, mse: 0.24032030
|
| 291 |
+
[2026-01-08 18:55:16 root] (train_utils.py 185): INFO layer 13 lwc lac iter 7, lr 0.00224144 time 3.871221s, mse: 0.23895445
|
| 292 |
+
[2026-01-08 18:55:19 root] (train_utils.py 185): INFO layer 13 lwc lac iter 8, lr 0.00173073 time 3.870597s, mse: 0.23795472
|
| 293 |
+
[2026-01-08 18:55:23 root] (train_utils.py 185): INFO layer 13 lwc lac iter 9, lr 0.00125375 time 3.872128s, mse: 0.23691620
|
| 294 |
+
[2026-01-08 18:55:27 root] (train_utils.py 185): INFO layer 13 lwc lac iter 10, lr 0.00083135 time 3.870418s, mse: 0.23617835
|
| 295 |
+
[2026-01-08 18:55:31 root] (train_utils.py 185): INFO layer 13 lwc lac iter 11, lr 0.00048198 time 3.875942s, mse: 0.23538260
|
| 296 |
+
[2026-01-08 18:55:35 root] (train_utils.py 185): INFO layer 13 lwc lac iter 12, lr 0.00022092 time 3.872318s, mse: 0.23459788
|
| 297 |
+
[2026-01-08 18:55:39 root] (train_utils.py 185): INFO layer 13 lwc lac iter 13, lr 0.00005958 time 3.874342s, mse: 0.23386008
|
| 298 |
+
[2026-01-08 18:55:43 root] (train_utils.py 185): INFO layer 13 lwc lac iter 14, lr 0.00000500 time 3.876184s, mse: 0.23347831
|
| 299 |
+
[2026-01-08 18:55:43 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 300 |
+
[2026-01-08 18:55:44 root] (train_utils.py 108): INFO ========= Layer 14 =========
|
| 301 |
+
[2026-01-08 18:55:52 root] (train_utils.py 185): INFO layer 14 lwc lac iter 0, lr 0.00494542 time 5.314241s, mse: 0.48670265
|
| 302 |
+
[2026-01-08 18:55:56 root] (train_utils.py 185): INFO layer 14 lwc lac iter 1, lr 0.00478408 time 3.883879s, mse: 0.32924685
|
| 303 |
+
[2026-01-08 18:55:59 root] (train_utils.py 185): INFO layer 14 lwc lac iter 2, lr 0.00452302 time 3.871489s, mse: 0.27174610
|
| 304 |
+
[2026-01-08 18:56:03 root] (train_utils.py 185): INFO layer 14 lwc lac iter 3, lr 0.00417365 time 3.877281s, mse: 0.26111004
|
| 305 |
+
[2026-01-08 18:56:07 root] (train_utils.py 185): INFO layer 14 lwc lac iter 4, lr 0.00375125 time 3.878602s, mse: 0.25857583
|
| 306 |
+
[2026-01-08 18:56:11 root] (train_utils.py 185): INFO layer 14 lwc lac iter 5, lr 0.00327427 time 3.884447s, mse: 0.25724220
|
| 307 |
+
[2026-01-08 18:56:15 root] (train_utils.py 185): INFO layer 14 lwc lac iter 6, lr 0.00276356 time 3.877262s, mse: 0.25530052
|
| 308 |
+
[2026-01-08 18:56:19 root] (train_utils.py 185): INFO layer 14 lwc lac iter 7, lr 0.00224144 time 3.876701s, mse: 0.25373703
|
| 309 |
+
[2026-01-08 18:56:23 root] (train_utils.py 185): INFO layer 14 lwc lac iter 8, lr 0.00173073 time 3.869868s, mse: 0.25232333
|
| 310 |
+
[2026-01-08 18:56:27 root] (train_utils.py 185): INFO layer 14 lwc lac iter 9, lr 0.00125375 time 3.867889s, mse: 0.25103748
|
| 311 |
+
[2026-01-08 18:56:30 root] (train_utils.py 185): INFO layer 14 lwc lac iter 10, lr 0.00083135 time 3.876401s, mse: 0.24987648
|
| 312 |
+
[2026-01-08 18:56:34 root] (train_utils.py 185): INFO layer 14 lwc lac iter 11, lr 0.00048198 time 3.875775s, mse: 0.24912813
|
| 313 |
+
[2026-01-08 18:56:38 root] (train_utils.py 185): INFO layer 14 lwc lac iter 12, lr 0.00022092 time 3.870832s, mse: 0.24813016
|
| 314 |
+
[2026-01-08 18:56:42 root] (train_utils.py 185): INFO layer 14 lwc lac iter 13, lr 0.00005958 time 3.870428s, mse: 0.24762598
|
| 315 |
+
[2026-01-08 18:56:46 root] (train_utils.py 185): INFO layer 14 lwc lac iter 14, lr 0.00000500 time 3.866288s, mse: 0.24739194
|
| 316 |
+
[2026-01-08 18:56:46 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 317 |
+
[2026-01-08 18:56:47 root] (train_utils.py 108): INFO ========= Layer 15 =========
|
| 318 |
+
[2026-01-08 18:56:56 root] (train_utils.py 185): INFO layer 15 lwc lac iter 0, lr 0.00494542 time 5.946069s, mse: 0.48941827
|
| 319 |
+
[2026-01-08 18:57:00 root] (train_utils.py 185): INFO layer 15 lwc lac iter 1, lr 0.00478408 time 3.964455s, mse: 0.32720220
|
| 320 |
+
[2026-01-08 18:57:03 root] (train_utils.py 185): INFO layer 15 lwc lac iter 2, lr 0.00452302 time 3.909539s, mse: 0.26854873
|
| 321 |
+
[2026-01-08 18:57:07 root] (train_utils.py 185): INFO layer 15 lwc lac iter 3, lr 0.00417365 time 3.873296s, mse: 0.25705975
|
| 322 |
+
[2026-01-08 18:57:11 root] (train_utils.py 185): INFO layer 15 lwc lac iter 4, lr 0.00375125 time 3.876359s, mse: 0.25422159
|
| 323 |
+
[2026-01-08 18:57:15 root] (train_utils.py 185): INFO layer 15 lwc lac iter 5, lr 0.00327427 time 3.876583s, mse: 0.25197345
|
| 324 |
+
[2026-01-08 18:57:19 root] (train_utils.py 185): INFO layer 15 lwc lac iter 6, lr 0.00276356 time 3.870843s, mse: 0.25026903
|
| 325 |
+
[2026-01-08 18:57:23 root] (train_utils.py 185): INFO layer 15 lwc lac iter 7, lr 0.00224144 time 3.871294s, mse: 0.24867499
|
| 326 |
+
[2026-01-08 18:57:27 root] (train_utils.py 185): INFO layer 15 lwc lac iter 8, lr 0.00173073 time 3.868954s, mse: 0.24771519
|
| 327 |
+
[2026-01-08 18:57:31 root] (train_utils.py 185): INFO layer 15 lwc lac iter 9, lr 0.00125375 time 3.875140s, mse: 0.24665023
|
| 328 |
+
[2026-01-08 18:57:34 root] (train_utils.py 185): INFO layer 15 lwc lac iter 10, lr 0.00083135 time 3.873682s, mse: 0.24558856
|
| 329 |
+
[2026-01-08 18:57:38 root] (train_utils.py 185): INFO layer 15 lwc lac iter 11, lr 0.00048198 time 3.872494s, mse: 0.24435455
|
| 330 |
+
[2026-01-08 18:57:42 root] (train_utils.py 185): INFO layer 15 lwc lac iter 12, lr 0.00022092 time 3.873586s, mse: 0.24346027
|
| 331 |
+
[2026-01-08 18:57:46 root] (train_utils.py 185): INFO layer 15 lwc lac iter 13, lr 0.00005958 time 3.890473s, mse: 0.24292424
|
| 332 |
+
[2026-01-08 18:57:50 root] (train_utils.py 185): INFO layer 15 lwc lac iter 14, lr 0.00000500 time 3.872211s, mse: 0.24260354
|
| 333 |
+
[2026-01-08 18:57:50 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 334 |
+
[2026-01-08 18:57:51 root] (train_utils.py 108): INFO ========= Layer 16 =========
|
| 335 |
+
[2026-01-08 18:57:58 root] (train_utils.py 185): INFO layer 16 lwc lac iter 0, lr 0.00494542 time 4.626063s, mse: 3.09758520
|
| 336 |
+
[2026-01-08 18:58:02 root] (train_utils.py 185): INFO layer 16 lwc lac iter 1, lr 0.00478408 time 3.892791s, mse: 1.53681600
|
| 337 |
+
[2026-01-08 18:58:06 root] (train_utils.py 185): INFO layer 16 lwc lac iter 2, lr 0.00452302 time 3.875230s, mse: 1.37538433
|
| 338 |
+
[2026-01-08 18:58:10 root] (train_utils.py 185): INFO layer 16 lwc lac iter 3, lr 0.00417365 time 3.887590s, mse: 1.14041376
|
| 339 |
+
[2026-01-08 18:58:14 root] (train_utils.py 185): INFO layer 16 lwc lac iter 4, lr 0.00375125 time 3.873685s, mse: 1.13041377
|
| 340 |
+
[2026-01-08 18:58:18 root] (train_utils.py 185): INFO layer 16 lwc lac iter 5, lr 0.00327427 time 3.886949s, mse: 1.17505825
|
| 341 |
+
[2026-01-08 18:58:21 root] (train_utils.py 185): INFO layer 16 lwc lac iter 6, lr 0.00276356 time 3.875163s, mse: 1.00187659
|
| 342 |
+
[2026-01-08 18:58:25 root] (train_utils.py 185): INFO layer 16 lwc lac iter 7, lr 0.00224144 time 3.873014s, mse: 1.15916288
|
| 343 |
+
[2026-01-08 18:58:29 root] (train_utils.py 185): INFO layer 16 lwc lac iter 8, lr 0.00173073 time 3.870400s, mse: 0.93556213
|
| 344 |
+
[2026-01-08 18:58:33 root] (train_utils.py 185): INFO layer 16 lwc lac iter 9, lr 0.00125375 time 3.883701s, mse: 0.89307052
|
| 345 |
+
[2026-01-08 18:58:37 root] (train_utils.py 185): INFO layer 16 lwc lac iter 10, lr 0.00083135 time 3.874257s, mse: 1.08854449
|
| 346 |
+
[2026-01-08 18:58:41 root] (train_utils.py 185): INFO layer 16 lwc lac iter 11, lr 0.00048198 time 3.872130s, mse: 0.78587675
|
| 347 |
+
[2026-01-08 18:58:45 root] (train_utils.py 185): INFO layer 16 lwc lac iter 12, lr 0.00022092 time 3.872023s, mse: 0.77024889
|
| 348 |
+
[2026-01-08 18:58:49 root] (train_utils.py 185): INFO layer 16 lwc lac iter 13, lr 0.00005958 time 3.877707s, mse: 0.74143833
|
| 349 |
+
[2026-01-08 18:58:52 root] (train_utils.py 185): INFO layer 16 lwc lac iter 14, lr 0.00000500 time 3.870870s, mse: 0.62904388
|
| 350 |
+
[2026-01-08 18:58:53 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 351 |
+
[2026-01-08 18:58:54 root] (train_utils.py 108): INFO ========= Layer 17 =========
|
| 352 |
+
[2026-01-08 18:59:02 root] (train_utils.py 185): INFO layer 17 lwc lac iter 0, lr 0.00494542 time 5.444665s, mse: 0.57632238
|
| 353 |
+
[2026-01-08 18:59:06 root] (train_utils.py 185): INFO layer 17 lwc lac iter 1, lr 0.00478408 time 3.880440s, mse: 0.38568184
|
| 354 |
+
[2026-01-08 18:59:09 root] (train_utils.py 185): INFO layer 17 lwc lac iter 2, lr 0.00452302 time 3.868829s, mse: 0.30990756
|
| 355 |
+
[2026-01-08 18:59:13 root] (train_utils.py 185): INFO layer 17 lwc lac iter 3, lr 0.00417365 time 3.874089s, mse: 0.29348093
|
| 356 |
+
[2026-01-08 18:59:17 root] (train_utils.py 185): INFO layer 17 lwc lac iter 4, lr 0.00375125 time 3.880884s, mse: 0.28841209
|
| 357 |
+
[2026-01-08 18:59:21 root] (train_utils.py 185): INFO layer 17 lwc lac iter 5, lr 0.00327427 time 3.874481s, mse: 0.28536177
|
| 358 |
+
[2026-01-08 18:59:25 root] (train_utils.py 185): INFO layer 17 lwc lac iter 6, lr 0.00276356 time 3.876625s, mse: 0.28336507
|
| 359 |
+
[2026-01-08 18:59:29 root] (train_utils.py 185): INFO layer 17 lwc lac iter 7, lr 0.00224144 time 3.883069s, mse: 0.28023016
|
| 360 |
+
[2026-01-08 18:59:33 root] (train_utils.py 185): INFO layer 17 lwc lac iter 8, lr 0.00173073 time 3.871980s, mse: 0.27797151
|
| 361 |
+
[2026-01-08 18:59:37 root] (train_utils.py 185): INFO layer 17 lwc lac iter 9, lr 0.00125375 time 3.875011s, mse: 0.27724716
|
| 362 |
+
[2026-01-08 18:59:40 root] (train_utils.py 185): INFO layer 17 lwc lac iter 10, lr 0.00083135 time 3.871064s, mse: 0.27549568
|
| 363 |
+
[2026-01-08 18:59:44 root] (train_utils.py 185): INFO layer 17 lwc lac iter 11, lr 0.00048198 time 3.874846s, mse: 0.27411795
|
| 364 |
+
[2026-01-08 18:59:48 root] (train_utils.py 185): INFO layer 17 lwc lac iter 12, lr 0.00022092 time 3.875467s, mse: 0.27230272
|
| 365 |
+
[2026-01-08 18:59:52 root] (train_utils.py 185): INFO layer 17 lwc lac iter 13, lr 0.00005958 time 3.871872s, mse: 0.27161792
|
| 366 |
+
[2026-01-08 18:59:56 root] (train_utils.py 185): INFO layer 17 lwc lac iter 14, lr 0.00000500 time 3.877031s, mse: 0.27142629
|
| 367 |
+
[2026-01-08 18:59:56 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 368 |
+
[2026-01-08 18:59:57 root] (train_utils.py 108): INFO ========= Layer 18 =========
|
| 369 |
+
[2026-01-08 19:00:05 root] (train_utils.py 185): INFO layer 18 lwc lac iter 0, lr 0.00494542 time 5.037441s, mse: 0.68219566
|
| 370 |
+
[2026-01-08 19:00:09 root] (train_utils.py 185): INFO layer 18 lwc lac iter 1, lr 0.00478408 time 3.873358s, mse: 0.44933167
|
| 371 |
+
[2026-01-08 19:00:13 root] (train_utils.py 185): INFO layer 18 lwc lac iter 2, lr 0.00452302 time 3.869275s, mse: 0.36149144
|
| 372 |
+
[2026-01-08 19:00:17 root] (train_utils.py 185): INFO layer 18 lwc lac iter 3, lr 0.00417365 time 3.869983s, mse: 0.34437451
|
| 373 |
+
[2026-01-08 19:00:21 root] (train_utils.py 185): INFO layer 18 lwc lac iter 4, lr 0.00375125 time 3.867804s, mse: 0.33928376
|
| 374 |
+
[2026-01-08 19:00:24 root] (train_utils.py 185): INFO layer 18 lwc lac iter 5, lr 0.00327427 time 3.872184s, mse: 0.33628541
|
| 375 |
+
[2026-01-08 19:00:28 root] (train_utils.py 185): INFO layer 18 lwc lac iter 6, lr 0.00276356 time 3.868960s, mse: 0.33380261
|
| 376 |
+
[2026-01-08 19:00:32 root] (train_utils.py 185): INFO layer 18 lwc lac iter 7, lr 0.00224144 time 3.872158s, mse: 0.33132178
|
| 377 |
+
[2026-01-08 19:00:36 root] (train_utils.py 185): INFO layer 18 lwc lac iter 8, lr 0.00173073 time 3.871630s, mse: 0.32943395
|
| 378 |
+
[2026-01-08 19:00:40 root] (train_utils.py 185): INFO layer 18 lwc lac iter 9, lr 0.00125375 time 3.873256s, mse: 0.32786560
|
| 379 |
+
[2026-01-08 19:00:44 root] (train_utils.py 185): INFO layer 18 lwc lac iter 10, lr 0.00083135 time 3.874281s, mse: 0.32583937
|
| 380 |
+
[2026-01-08 19:00:48 root] (train_utils.py 185): INFO layer 18 lwc lac iter 11, lr 0.00048198 time 3.869690s, mse: 0.32450172
|
| 381 |
+
[2026-01-08 19:00:52 root] (train_utils.py 185): INFO layer 18 lwc lac iter 12, lr 0.00022092 time 3.871679s, mse: 0.32264820
|
| 382 |
+
[2026-01-08 19:00:55 root] (train_utils.py 185): INFO layer 18 lwc lac iter 13, lr 0.00005958 time 3.878042s, mse: 0.32187557
|
| 383 |
+
[2026-01-08 19:00:59 root] (train_utils.py 185): INFO layer 18 lwc lac iter 14, lr 0.00000500 time 3.873233s, mse: 0.32105669
|
| 384 |
+
[2026-01-08 19:01:00 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 385 |
+
[2026-01-08 19:01:01 root] (train_utils.py 108): INFO ========= Layer 19 =========
|
| 386 |
+
[2026-01-08 19:01:10 root] (train_utils.py 185): INFO layer 19 lwc lac iter 0, lr 0.00494542 time 5.786413s, mse: 0.88728219
|
| 387 |
+
[2026-01-08 19:01:14 root] (train_utils.py 185): INFO layer 19 lwc lac iter 1, lr 0.00478408 time 4.102961s, mse: 0.57078516
|
| 388 |
+
[2026-01-08 19:01:18 root] (train_utils.py 185): INFO layer 19 lwc lac iter 2, lr 0.00452302 time 3.876380s, mse: 0.45792666
|
| 389 |
+
[2026-01-08 19:01:22 root] (train_utils.py 185): INFO layer 19 lwc lac iter 3, lr 0.00417365 time 3.871944s, mse: 0.43537480
|
| 390 |
+
[2026-01-08 19:01:25 root] (train_utils.py 185): INFO layer 19 lwc lac iter 4, lr 0.00375125 time 3.870567s, mse: 0.42894897
|
| 391 |
+
[2026-01-08 19:01:29 root] (train_utils.py 185): INFO layer 19 lwc lac iter 5, lr 0.00327427 time 3.872926s, mse: 0.42462113
|
| 392 |
+
[2026-01-08 19:01:33 root] (train_utils.py 185): INFO layer 19 lwc lac iter 6, lr 0.00276356 time 3.871444s, mse: 0.42157629
|
| 393 |
+
[2026-01-08 19:01:37 root] (train_utils.py 185): INFO layer 19 lwc lac iter 7, lr 0.00224144 time 3.871675s, mse: 0.41864219
|
| 394 |
+
[2026-01-08 19:01:41 root] (train_utils.py 185): INFO layer 19 lwc lac iter 8, lr 0.00173073 time 3.871620s, mse: 0.41570342
|
| 395 |
+
[2026-01-08 19:01:45 root] (train_utils.py 185): INFO layer 19 lwc lac iter 9, lr 0.00125375 time 3.872640s, mse: 0.41345572
|
| 396 |
+
[2026-01-08 19:01:49 root] (train_utils.py 185): INFO layer 19 lwc lac iter 10, lr 0.00083135 time 3.872133s, mse: 0.41054672
|
| 397 |
+
[2026-01-08 19:01:52 root] (train_utils.py 185): INFO layer 19 lwc lac iter 11, lr 0.00048198 time 3.871707s, mse: 0.40846488
|
| 398 |
+
[2026-01-08 19:01:56 root] (train_utils.py 185): INFO layer 19 lwc lac iter 12, lr 0.00022092 time 3.876148s, mse: 0.40727249
|
| 399 |
+
[2026-01-08 19:02:00 root] (train_utils.py 185): INFO layer 19 lwc lac iter 13, lr 0.00005958 time 3.881454s, mse: 0.40628025
|
| 400 |
+
[2026-01-08 19:02:04 root] (train_utils.py 185): INFO layer 19 lwc lac iter 14, lr 0.00000500 time 3.875313s, mse: 0.40573606
|
| 401 |
+
[2026-01-08 19:02:05 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 402 |
+
[2026-01-08 19:02:06 root] (train_utils.py 108): INFO ========= Layer 20 =========
|
| 403 |
+
[2026-01-08 19:02:15 root] (train_utils.py 185): INFO layer 20 lwc lac iter 0, lr 0.00494542 time 5.941118s, mse: 0.88836050
|
| 404 |
+
[2026-01-08 19:02:19 root] (train_utils.py 185): INFO layer 20 lwc lac iter 1, lr 0.00478408 time 3.928517s, mse: 0.59483135
|
| 405 |
+
[2026-01-08 19:02:23 root] (train_utils.py 185): INFO layer 20 lwc lac iter 2, lr 0.00452302 time 3.871794s, mse: 0.48579982
|
| 406 |
+
[2026-01-08 19:02:27 root] (train_utils.py 185): INFO layer 20 lwc lac iter 3, lr 0.00417365 time 3.873976s, mse: 0.46583182
|
| 407 |
+
[2026-01-08 19:02:31 root] (train_utils.py 185): INFO layer 20 lwc lac iter 4, lr 0.00375125 time 3.873488s, mse: 0.46044937
|
| 408 |
+
[2026-01-08 19:02:34 root] (train_utils.py 185): INFO layer 20 lwc lac iter 5, lr 0.00327427 time 3.871722s, mse: 0.45749170
|
| 409 |
+
[2026-01-08 19:02:38 root] (train_utils.py 185): INFO layer 20 lwc lac iter 6, lr 0.00276356 time 3.879350s, mse: 0.45316568
|
| 410 |
+
[2026-01-08 19:02:42 root] (train_utils.py 185): INFO layer 20 lwc lac iter 7, lr 0.00224144 time 3.870425s, mse: 0.45053339
|
| 411 |
+
[2026-01-08 19:02:46 root] (train_utils.py 185): INFO layer 20 lwc lac iter 8, lr 0.00173073 time 3.879096s, mse: 0.44832462
|
| 412 |
+
[2026-01-08 19:02:50 root] (train_utils.py 185): INFO layer 20 lwc lac iter 9, lr 0.00125375 time 3.866863s, mse: 0.44616416
|
| 413 |
+
[2026-01-08 19:02:54 root] (train_utils.py 185): INFO layer 20 lwc lac iter 10, lr 0.00083135 time 3.871922s, mse: 0.44334349
|
| 414 |
+
[2026-01-08 19:02:58 root] (train_utils.py 185): INFO layer 20 lwc lac iter 11, lr 0.00048198 time 3.874037s, mse: 0.44204527
|
| 415 |
+
[2026-01-08 19:03:02 root] (train_utils.py 185): INFO layer 20 lwc lac iter 12, lr 0.00022092 time 3.872909s, mse: 0.43987796
|
| 416 |
+
[2026-01-08 19:03:05 root] (train_utils.py 185): INFO layer 20 lwc lac iter 13, lr 0.00005958 time 3.871884s, mse: 0.43863490
|
| 417 |
+
[2026-01-08 19:03:09 root] (train_utils.py 185): INFO layer 20 lwc lac iter 14, lr 0.00000500 time 3.880204s, mse: 0.43791217
|
| 418 |
+
[2026-01-08 19:03:10 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 419 |
+
[2026-01-08 19:03:11 root] (train_utils.py 108): INFO ========= Layer 21 =========
|
| 420 |
+
[2026-01-08 19:03:20 root] (train_utils.py 185): INFO layer 21 lwc lac iter 0, lr 0.00494542 time 5.436487s, mse: 1.18043423
|
| 421 |
+
[2026-01-08 19:03:24 root] (train_utils.py 185): INFO layer 21 lwc lac iter 1, lr 0.00478408 time 3.929286s, mse: 0.77954561
|
| 422 |
+
[2026-01-08 19:03:27 root] (train_utils.py 185): INFO layer 21 lwc lac iter 2, lr 0.00452302 time 3.871185s, mse: 0.64111829
|
| 423 |
+
[2026-01-08 19:03:31 root] (train_utils.py 185): INFO layer 21 lwc lac iter 3, lr 0.00417365 time 3.875395s, mse: 0.61397409
|
| 424 |
+
[2026-01-08 19:03:35 root] (train_utils.py 185): INFO layer 21 lwc lac iter 4, lr 0.00375125 time 3.866674s, mse: 0.60631013
|
| 425 |
+
[2026-01-08 19:03:39 root] (train_utils.py 185): INFO layer 21 lwc lac iter 5, lr 0.00327427 time 3.872060s, mse: 0.60047567
|
| 426 |
+
[2026-01-08 19:03:43 root] (train_utils.py 185): INFO layer 21 lwc lac iter 6, lr 0.00276356 time 3.872387s, mse: 0.59512597
|
| 427 |
+
[2026-01-08 19:03:47 root] (train_utils.py 185): INFO layer 21 lwc lac iter 7, lr 0.00224144 time 3.870912s, mse: 0.59215677
|
| 428 |
+
[2026-01-08 19:03:51 root] (train_utils.py 185): INFO layer 21 lwc lac iter 8, lr 0.00173073 time 3.874108s, mse: 0.58796024
|
| 429 |
+
[2026-01-08 19:03:55 root] (train_utils.py 185): INFO layer 21 lwc lac iter 9, lr 0.00125375 time 3.873752s, mse: 0.58513182
|
| 430 |
+
[2026-01-08 19:03:58 root] (train_utils.py 185): INFO layer 21 lwc lac iter 10, lr 0.00083135 time 3.870490s, mse: 0.58225924
|
| 431 |
+
[2026-01-08 19:04:02 root] (train_utils.py 185): INFO layer 21 lwc lac iter 11, lr 0.00048198 time 3.874893s, mse: 0.57988369
|
| 432 |
+
[2026-01-08 19:04:06 root] (train_utils.py 185): INFO layer 21 lwc lac iter 12, lr 0.00022092 time 3.876416s, mse: 0.57718277
|
| 433 |
+
[2026-01-08 19:04:10 root] (train_utils.py 185): INFO layer 21 lwc lac iter 13, lr 0.00005958 time 3.874704s, mse: 0.57546204
|
| 434 |
+
[2026-01-08 19:04:14 root] (train_utils.py 185): INFO layer 21 lwc lac iter 14, lr 0.00000500 time 3.996245s, mse: 0.57469940
|
| 435 |
+
[2026-01-08 19:04:14 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 436 |
+
[2026-01-08 19:04:16 root] (train_utils.py 108): INFO ========= Layer 22 =========
|
| 437 |
+
[2026-01-08 19:04:23 root] (train_utils.py 185): INFO layer 22 lwc lac iter 0, lr 0.00494542 time 5.016886s, mse: 1.88664389
|
| 438 |
+
[2026-01-08 19:04:27 root] (train_utils.py 185): INFO layer 22 lwc lac iter 1, lr 0.00478408 time 3.875092s, mse: 1.18959606
|
| 439 |
+
[2026-01-08 19:04:31 root] (train_utils.py 185): INFO layer 22 lwc lac iter 2, lr 0.00452302 time 3.873603s, mse: 0.95907360
|
| 440 |
+
[2026-01-08 19:04:35 root] (train_utils.py 185): INFO layer 22 lwc lac iter 3, lr 0.00417365 time 3.870588s, mse: 0.91428280
|
| 441 |
+
[2026-01-08 19:04:39 root] (train_utils.py 185): INFO layer 22 lwc lac iter 4, lr 0.00375125 time 3.879905s, mse: 0.90376323
|
| 442 |
+
[2026-01-08 19:04:43 root] (train_utils.py 185): INFO layer 22 lwc lac iter 5, lr 0.00327427 time 3.871378s, mse: 0.89363086
|
| 443 |
+
[2026-01-08 19:04:47 root] (train_utils.py 185): INFO layer 22 lwc lac iter 6, lr 0.00276356 time 3.873839s, mse: 0.88751125
|
| 444 |
+
[2026-01-08 19:04:51 root] (train_utils.py 185): INFO layer 22 lwc lac iter 7, lr 0.00224144 time 3.877680s, mse: 0.87932986
|
| 445 |
+
[2026-01-08 19:04:54 root] (train_utils.py 185): INFO layer 22 lwc lac iter 8, lr 0.00173073 time 3.870394s, mse: 0.87506205
|
| 446 |
+
[2026-01-08 19:04:58 root] (train_utils.py 185): INFO layer 22 lwc lac iter 9, lr 0.00125375 time 3.868981s, mse: 0.86960399
|
| 447 |
+
[2026-01-08 19:05:02 root] (train_utils.py 185): INFO layer 22 lwc lac iter 10, lr 0.00083135 time 3.871379s, mse: 0.86433518
|
| 448 |
+
[2026-01-08 19:05:06 root] (train_utils.py 185): INFO layer 22 lwc lac iter 11, lr 0.00048198 time 3.873498s, mse: 0.85831034
|
| 449 |
+
[2026-01-08 19:05:10 root] (train_utils.py 185): INFO layer 22 lwc lac iter 12, lr 0.00022092 time 3.875648s, mse: 0.85434479
|
| 450 |
+
[2026-01-08 19:05:14 root] (train_utils.py 185): INFO layer 22 lwc lac iter 13, lr 0.00005958 time 3.873622s, mse: 0.85274106
|
| 451 |
+
[2026-01-08 19:05:18 root] (train_utils.py 185): INFO layer 22 lwc lac iter 14, lr 0.00000500 time 3.876666s, mse: 0.85105854
|
| 452 |
+
[2026-01-08 19:05:18 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 453 |
+
[2026-01-08 19:05:19 root] (train_utils.py 108): INFO ========= Layer 23 =========
|
| 454 |
+
[2026-01-08 19:05:27 root] (train_utils.py 185): INFO layer 23 lwc lac iter 0, lr 0.00494542 time 5.240867s, mse: 2.56160784
|
| 455 |
+
[2026-01-08 19:05:31 root] (train_utils.py 185): INFO layer 23 lwc lac iter 1, lr 0.00478408 time 3.870888s, mse: 1.69400561
|
| 456 |
+
[2026-01-08 19:05:34 root] (train_utils.py 185): INFO layer 23 lwc lac iter 2, lr 0.00452302 time 3.871142s, mse: 1.40092814
|
| 457 |
+
[2026-01-08 19:05:38 root] (train_utils.py 185): INFO layer 23 lwc lac iter 3, lr 0.00417365 time 3.871503s, mse: 1.33960748
|
| 458 |
+
[2026-01-08 19:05:42 root] (train_utils.py 185): INFO layer 23 lwc lac iter 4, lr 0.00375125 time 3.876095s, mse: 1.31923652
|
| 459 |
+
[2026-01-08 19:05:46 root] (train_utils.py 185): INFO layer 23 lwc lac iter 5, lr 0.00327427 time 3.883029s, mse: 1.30260742
|
| 460 |
+
[2026-01-08 19:05:50 root] (train_utils.py 185): INFO layer 23 lwc lac iter 6, lr 0.00276356 time 3.871961s, mse: 1.29341400
|
| 461 |
+
[2026-01-08 19:05:54 root] (train_utils.py 185): INFO layer 23 lwc lac iter 7, lr 0.00224144 time 3.867985s, mse: 1.28473794
|
| 462 |
+
[2026-01-08 19:05:58 root] (train_utils.py 185): INFO layer 23 lwc lac iter 8, lr 0.00173073 time 3.873707s, mse: 1.27725101
|
| 463 |
+
[2026-01-08 19:06:02 root] (train_utils.py 185): INFO layer 23 lwc lac iter 9, lr 0.00125375 time 3.868931s, mse: 1.27071691
|
| 464 |
+
[2026-01-08 19:06:05 root] (train_utils.py 185): INFO layer 23 lwc lac iter 10, lr 0.00083135 time 3.874512s, mse: 1.26552820
|
| 465 |
+
[2026-01-08 19:06:09 root] (train_utils.py 185): INFO layer 23 lwc lac iter 11, lr 0.00048198 time 3.869352s, mse: 1.26018000
|
| 466 |
+
[2026-01-08 19:06:13 root] (train_utils.py 185): INFO layer 23 lwc lac iter 12, lr 0.00022092 time 3.874362s, mse: 1.25696874
|
| 467 |
+
[2026-01-08 19:06:17 root] (train_utils.py 185): INFO layer 23 lwc lac iter 13, lr 0.00005958 time 3.875837s, mse: 1.25348544
|
| 468 |
+
[2026-01-08 19:06:21 root] (train_utils.py 185): INFO layer 23 lwc lac iter 14, lr 0.00000500 time 3.874819s, mse: 1.25113153
|
| 469 |
+
[2026-01-08 19:06:21 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 470 |
+
[2026-01-08 19:06:22 root] (train_utils.py 108): INFO ========= Layer 24 =========
|
| 471 |
+
[2026-01-08 19:06:30 root] (train_utils.py 185): INFO layer 24 lwc lac iter 0, lr 0.00494542 time 5.198261s, mse: 3.33080626
|
| 472 |
+
[2026-01-08 19:06:34 root] (train_utils.py 185): INFO layer 24 lwc lac iter 1, lr 0.00478408 time 3.873992s, mse: 2.21739531
|
| 473 |
+
[2026-01-08 19:06:38 root] (train_utils.py 185): INFO layer 24 lwc lac iter 2, lr 0.00452302 time 3.884522s, mse: 1.83558488
|
| 474 |
+
[2026-01-08 19:06:42 root] (train_utils.py 185): INFO layer 24 lwc lac iter 3, lr 0.00417365 time 3.872766s, mse: 1.75192118
|
| 475 |
+
[2026-01-08 19:06:45 root] (train_utils.py 185): INFO layer 24 lwc lac iter 4, lr 0.00375125 time 3.874427s, mse: 1.73021388
|
| 476 |
+
[2026-01-08 19:06:49 root] (train_utils.py 185): INFO layer 24 lwc lac iter 5, lr 0.00327427 time 3.869450s, mse: 1.70965135
|
| 477 |
+
[2026-01-08 19:06:53 root] (train_utils.py 185): INFO layer 24 lwc lac iter 6, lr 0.00276356 time 3.871820s, mse: 1.69753647
|
| 478 |
+
[2026-01-08 19:06:57 root] (train_utils.py 185): INFO layer 24 lwc lac iter 7, lr 0.00224144 time 3.876565s, mse: 1.68364048
|
| 479 |
+
[2026-01-08 19:07:01 root] (train_utils.py 185): INFO layer 24 lwc lac iter 8, lr 0.00173073 time 3.880944s, mse: 1.67123342
|
| 480 |
+
[2026-01-08 19:07:05 root] (train_utils.py 185): INFO layer 24 lwc lac iter 9, lr 0.00125375 time 3.879658s, mse: 1.66224420
|
| 481 |
+
[2026-01-08 19:07:09 root] (train_utils.py 185): INFO layer 24 lwc lac iter 10, lr 0.00083135 time 3.870554s, mse: 1.65476453
|
| 482 |
+
[2026-01-08 19:07:13 root] (train_utils.py 185): INFO layer 24 lwc lac iter 11, lr 0.00048198 time 3.873933s, mse: 1.64498436
|
| 483 |
+
[2026-01-08 19:07:16 root] (train_utils.py 185): INFO layer 24 lwc lac iter 12, lr 0.00022092 time 3.881927s, mse: 1.63647079
|
| 484 |
+
[2026-01-08 19:07:20 root] (train_utils.py 185): INFO layer 24 lwc lac iter 13, lr 0.00005958 time 3.875716s, mse: 1.63291585
|
| 485 |
+
[2026-01-08 19:07:25 root] (train_utils.py 185): INFO layer 24 lwc lac iter 14, lr 0.00000500 time 4.456192s, mse: 1.63007939
|
| 486 |
+
[2026-01-08 19:07:25 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 487 |
+
[2026-01-08 19:07:27 root] (train_utils.py 108): INFO ========= Layer 25 =========
|
| 488 |
+
[2026-01-08 19:07:35 root] (train_utils.py 185): INFO layer 25 lwc lac iter 0, lr 0.00494542 time 5.174078s, mse: 3.67945337
|
| 489 |
+
[2026-01-08 19:07:38 root] (train_utils.py 185): INFO layer 25 lwc lac iter 1, lr 0.00478408 time 3.928041s, mse: 2.39840055
|
| 490 |
+
[2026-01-08 19:07:42 root] (train_utils.py 185): INFO layer 25 lwc lac iter 2, lr 0.00452302 time 3.927209s, mse: 2.00158238
|
| 491 |
+
[2026-01-08 19:07:46 root] (train_utils.py 185): INFO layer 25 lwc lac iter 3, lr 0.00417365 time 3.886188s, mse: 1.92655563
|
| 492 |
+
[2026-01-08 19:07:50 root] (train_utils.py 185): INFO layer 25 lwc lac iter 4, lr 0.00375125 time 3.873621s, mse: 1.90741169
|
| 493 |
+
[2026-01-08 19:07:54 root] (train_utils.py 185): INFO layer 25 lwc lac iter 5, lr 0.00327427 time 3.874353s, mse: 1.89064825
|
| 494 |
+
[2026-01-08 19:07:58 root] (train_utils.py 185): INFO layer 25 lwc lac iter 6, lr 0.00276356 time 3.867891s, mse: 1.88254857
|
| 495 |
+
[2026-01-08 19:08:02 root] (train_utils.py 185): INFO layer 25 lwc lac iter 7, lr 0.00224144 time 3.872960s, mse: 1.87189174
|
| 496 |
+
[2026-01-08 19:08:06 root] (train_utils.py 185): INFO layer 25 lwc lac iter 8, lr 0.00173073 time 3.890402s, mse: 1.86226833
|
| 497 |
+
[2026-01-08 19:08:10 root] (train_utils.py 185): INFO layer 25 lwc lac iter 9, lr 0.00125375 time 3.876318s, mse: 1.85414529
|
| 498 |
+
[2026-01-08 19:08:13 root] (train_utils.py 185): INFO layer 25 lwc lac iter 10, lr 0.00083135 time 3.869507s, mse: 1.84632003
|
| 499 |
+
[2026-01-08 19:08:17 root] (train_utils.py 185): INFO layer 25 lwc lac iter 11, lr 0.00048198 time 3.872166s, mse: 1.83962476
|
| 500 |
+
[2026-01-08 19:08:21 root] (train_utils.py 185): INFO layer 25 lwc lac iter 12, lr 0.00022092 time 3.871724s, mse: 1.83272731
|
| 501 |
+
[2026-01-08 19:08:25 root] (train_utils.py 185): INFO layer 25 lwc lac iter 13, lr 0.00005958 time 3.872951s, mse: 1.83188641
|
| 502 |
+
[2026-01-08 19:08:29 root] (train_utils.py 185): INFO layer 25 lwc lac iter 14, lr 0.00000500 time 3.873922s, mse: 1.82856822
|
| 503 |
+
[2026-01-08 19:08:29 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 504 |
+
[2026-01-08 19:08:30 root] (train_utils.py 108): INFO ========= Layer 26 =========
|
| 505 |
+
[2026-01-08 19:08:38 root] (train_utils.py 185): INFO layer 26 lwc lac iter 0, lr 0.00494542 time 5.560466s, mse: 4.35819054
|
| 506 |
+
[2026-01-08 19:08:42 root] (train_utils.py 185): INFO layer 26 lwc lac iter 1, lr 0.00478408 time 3.936876s, mse: 2.94494462
|
| 507 |
+
[2026-01-08 19:08:46 root] (train_utils.py 185): INFO layer 26 lwc lac iter 2, lr 0.00452302 time 3.871977s, mse: 2.46222878
|
| 508 |
+
[2026-01-08 19:08:50 root] (train_utils.py 185): INFO layer 26 lwc lac iter 3, lr 0.00417365 time 3.870587s, mse: 2.36697221
|
| 509 |
+
[2026-01-08 19:08:54 root] (train_utils.py 185): INFO layer 26 lwc lac iter 4, lr 0.00375125 time 3.872370s, mse: 2.34871936
|
| 510 |
+
[2026-01-08 19:08:58 root] (train_utils.py 185): INFO layer 26 lwc lac iter 5, lr 0.00327427 time 3.873634s, mse: 2.33013940
|
| 511 |
+
[2026-01-08 19:09:01 root] (train_utils.py 185): INFO layer 26 lwc lac iter 6, lr 0.00276356 time 3.881063s, mse: 2.31725478
|
| 512 |
+
[2026-01-08 19:09:05 root] (train_utils.py 185): INFO layer 26 lwc lac iter 7, lr 0.00224144 time 3.873708s, mse: 2.30295658
|
| 513 |
+
[2026-01-08 19:09:09 root] (train_utils.py 185): INFO layer 26 lwc lac iter 8, lr 0.00173073 time 3.880649s, mse: 2.29171467
|
| 514 |
+
[2026-01-08 19:09:13 root] (train_utils.py 185): INFO layer 26 lwc lac iter 9, lr 0.00125375 time 3.878683s, mse: 2.28112888
|
| 515 |
+
[2026-01-08 19:09:17 root] (train_utils.py 185): INFO layer 26 lwc lac iter 10, lr 0.00083135 time 3.870284s, mse: 2.27260423
|
| 516 |
+
[2026-01-08 19:09:21 root] (train_utils.py 185): INFO layer 26 lwc lac iter 11, lr 0.00048198 time 3.876089s, mse: 2.26187754
|
| 517 |
+
[2026-01-08 19:09:25 root] (train_utils.py 185): INFO layer 26 lwc lac iter 12, lr 0.00022092 time 3.881677s, mse: 2.25517917
|
| 518 |
+
[2026-01-08 19:09:29 root] (train_utils.py 185): INFO layer 26 lwc lac iter 13, lr 0.00005958 time 3.872640s, mse: 2.24800634
|
| 519 |
+
[2026-01-08 19:09:32 root] (train_utils.py 185): INFO layer 26 lwc lac iter 14, lr 0.00000500 time 3.872203s, mse: 2.24403787
|
| 520 |
+
[2026-01-08 19:09:33 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 521 |
+
[2026-01-08 19:09:34 root] (train_utils.py 108): INFO ========= Layer 27 =========
|
| 522 |
+
[2026-01-08 19:09:42 root] (train_utils.py 185): INFO layer 27 lwc lac iter 0, lr 0.00494542 time 5.206716s, mse: 5.94560862
|
| 523 |
+
[2026-01-08 19:09:45 root] (train_utils.py 185): INFO layer 27 lwc lac iter 1, lr 0.00478408 time 3.875546s, mse: 3.95834851
|
| 524 |
+
[2026-01-08 19:09:49 root] (train_utils.py 185): INFO layer 27 lwc lac iter 2, lr 0.00452302 time 3.878089s, mse: 3.32281756
|
| 525 |
+
[2026-01-08 19:09:53 root] (train_utils.py 185): INFO layer 27 lwc lac iter 3, lr 0.00417365 time 3.869850s, mse: 3.18086267
|
| 526 |
+
[2026-01-08 19:09:57 root] (train_utils.py 185): INFO layer 27 lwc lac iter 4, lr 0.00375125 time 3.880871s, mse: 3.14467168
|
| 527 |
+
[2026-01-08 19:10:01 root] (train_utils.py 185): INFO layer 27 lwc lac iter 5, lr 0.00327427 time 3.874153s, mse: 3.12000346
|
| 528 |
+
[2026-01-08 19:10:05 root] (train_utils.py 185): INFO layer 27 lwc lac iter 6, lr 0.00276356 time 3.872086s, mse: 3.09776139
|
| 529 |
+
[2026-01-08 19:10:09 root] (train_utils.py 185): INFO layer 27 lwc lac iter 7, lr 0.00224144 time 3.870448s, mse: 3.07834363
|
| 530 |
+
[2026-01-08 19:10:12 root] (train_utils.py 185): INFO layer 27 lwc lac iter 8, lr 0.00173073 time 3.872187s, mse: 3.06277657
|
| 531 |
+
[2026-01-08 19:10:16 root] (train_utils.py 185): INFO layer 27 lwc lac iter 9, lr 0.00125375 time 3.868123s, mse: 3.04591680
|
| 532 |
+
[2026-01-08 19:10:20 root] (train_utils.py 185): INFO layer 27 lwc lac iter 10, lr 0.00083135 time 3.875916s, mse: 3.03134632
|
| 533 |
+
[2026-01-08 19:10:24 root] (train_utils.py 185): INFO layer 27 lwc lac iter 11, lr 0.00048198 time 3.873957s, mse: 3.01916480
|
| 534 |
+
[2026-01-08 19:10:28 root] (train_utils.py 185): INFO layer 27 lwc lac iter 12, lr 0.00022092 time 3.869634s, mse: 3.00719571
|
| 535 |
+
[2026-01-08 19:10:32 root] (train_utils.py 185): INFO layer 27 lwc lac iter 13, lr 0.00005958 time 3.868931s, mse: 2.99984956
|
| 536 |
+
[2026-01-08 19:10:36 root] (train_utils.py 185): INFO layer 27 lwc lac iter 14, lr 0.00000500 time 3.875341s, mse: 2.99120903
|
| 537 |
+
[2026-01-08 19:10:36 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 538 |
+
[2026-01-08 19:10:37 root] (train_utils.py 108): INFO ========= Layer 28 =========
|
| 539 |
+
[2026-01-08 19:10:45 root] (train_utils.py 185): INFO layer 28 lwc lac iter 0, lr 0.00494542 time 5.016971s, mse: 8.40579605
|
| 540 |
+
[2026-01-08 19:10:48 root] (train_utils.py 185): INFO layer 28 lwc lac iter 1, lr 0.00478408 time 3.877848s, mse: 5.55529737
|
| 541 |
+
[2026-01-08 19:10:52 root] (train_utils.py 185): INFO layer 28 lwc lac iter 2, lr 0.00452302 time 3.868832s, mse: 4.64479589
|
| 542 |
+
[2026-01-08 19:10:56 root] (train_utils.py 185): INFO layer 28 lwc lac iter 3, lr 0.00417365 time 3.866925s, mse: 4.46341419
|
| 543 |
+
[2026-01-08 19:11:00 root] (train_utils.py 185): INFO layer 28 lwc lac iter 4, lr 0.00375125 time 3.882068s, mse: 4.40386772
|
| 544 |
+
[2026-01-08 19:11:04 root] (train_utils.py 185): INFO layer 28 lwc lac iter 5, lr 0.00327427 time 3.872863s, mse: 4.37245226
|
| 545 |
+
[2026-01-08 19:11:08 root] (train_utils.py 185): INFO layer 28 lwc lac iter 6, lr 0.00276356 time 3.871567s, mse: 4.34240580
|
| 546 |
+
[2026-01-08 19:11:12 root] (train_utils.py 185): INFO layer 28 lwc lac iter 7, lr 0.00224144 time 3.868959s, mse: 4.31763363
|
| 547 |
+
[2026-01-08 19:11:15 root] (train_utils.py 185): INFO layer 28 lwc lac iter 8, lr 0.00173073 time 3.876480s, mse: 4.29854107
|
| 548 |
+
[2026-01-08 19:11:19 root] (train_utils.py 185): INFO layer 28 lwc lac iter 9, lr 0.00125375 time 3.870713s, mse: 4.28071547
|
| 549 |
+
[2026-01-08 19:11:23 root] (train_utils.py 185): INFO layer 28 lwc lac iter 10, lr 0.00083135 time 3.866973s, mse: 4.26679897
|
| 550 |
+
[2026-01-08 19:11:27 root] (train_utils.py 185): INFO layer 28 lwc lac iter 11, lr 0.00048198 time 3.869495s, mse: 4.24268007
|
| 551 |
+
[2026-01-08 19:11:31 root] (train_utils.py 185): INFO layer 28 lwc lac iter 12, lr 0.00022092 time 3.870375s, mse: 4.22641373
|
| 552 |
+
[2026-01-08 19:11:35 root] (train_utils.py 185): INFO layer 28 lwc lac iter 13, lr 0.00005958 time 3.871195s, mse: 4.22128248
|
| 553 |
+
[2026-01-08 19:11:39 root] (train_utils.py 185): INFO layer 28 lwc lac iter 14, lr 0.00000500 time 3.868412s, mse: 4.21494389
|
| 554 |
+
[2026-01-08 19:11:39 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 555 |
+
[2026-01-08 19:11:39 root] (train_utils.py 108): INFO ========= Layer 29 =========
|
| 556 |
+
[2026-01-08 19:11:47 root] (train_utils.py 185): INFO layer 29 lwc lac iter 0, lr 0.00494542 time 5.232268s, mse: 10.38746834
|
| 557 |
+
[2026-01-08 19:11:51 root] (train_utils.py 185): INFO layer 29 lwc lac iter 1, lr 0.00478408 time 4.035455s, mse: 7.14648628
|
| 558 |
+
[2026-01-08 19:11:55 root] (train_utils.py 185): INFO layer 29 lwc lac iter 2, lr 0.00452302 time 3.977416s, mse: 6.03318691
|
| 559 |
+
[2026-01-08 19:11:59 root] (train_utils.py 185): INFO layer 29 lwc lac iter 3, lr 0.00417365 time 4.021685s, mse: 5.78764057
|
| 560 |
+
[2026-01-08 19:12:03 root] (train_utils.py 185): INFO layer 29 lwc lac iter 4, lr 0.00375125 time 3.906455s, mse: 5.71550655
|
| 561 |
+
[2026-01-08 19:12:07 root] (train_utils.py 185): INFO layer 29 lwc lac iter 5, lr 0.00327427 time 3.877396s, mse: 5.66473246
|
| 562 |
+
[2026-01-08 19:12:11 root] (train_utils.py 185): INFO layer 29 lwc lac iter 6, lr 0.00276356 time 3.871670s, mse: 5.61916113
|
| 563 |
+
[2026-01-08 19:12:14 root] (train_utils.py 185): INFO layer 29 lwc lac iter 7, lr 0.00224144 time 3.873803s, mse: 5.58458805
|
| 564 |
+
[2026-01-08 19:12:18 root] (train_utils.py 185): INFO layer 29 lwc lac iter 8, lr 0.00173073 time 3.873369s, mse: 5.54784393
|
| 565 |
+
[2026-01-08 19:12:22 root] (train_utils.py 185): INFO layer 29 lwc lac iter 9, lr 0.00125375 time 3.873344s, mse: 5.52231646
|
| 566 |
+
[2026-01-08 19:12:26 root] (train_utils.py 185): INFO layer 29 lwc lac iter 10, lr 0.00083135 time 3.877249s, mse: 5.48976994
|
| 567 |
+
[2026-01-08 19:12:30 root] (train_utils.py 185): INFO layer 29 lwc lac iter 11, lr 0.00048198 time 3.912101s, mse: 5.46507311
|
| 568 |
+
[2026-01-08 19:12:34 root] (train_utils.py 185): INFO layer 29 lwc lac iter 12, lr 0.00022092 time 3.878016s, mse: 5.44575977
|
| 569 |
+
[2026-01-08 19:12:38 root] (train_utils.py 185): INFO layer 29 lwc lac iter 13, lr 0.00005958 time 3.873230s, mse: 5.43577242
|
| 570 |
+
[2026-01-08 19:12:42 root] (train_utils.py 185): INFO layer 29 lwc lac iter 14, lr 0.00000500 time 3.871523s, mse: 5.42604542
|
| 571 |
+
[2026-01-08 19:12:42 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 572 |
+
[2026-01-08 19:12:43 root] (train_utils.py 108): INFO ========= Layer 30 =========
|
| 573 |
+
[2026-01-08 19:12:50 root] (train_utils.py 185): INFO layer 30 lwc lac iter 0, lr 0.00494542 time 4.795969s, mse: 16.29405975
|
| 574 |
+
[2026-01-08 19:12:54 root] (train_utils.py 185): INFO layer 30 lwc lac iter 1, lr 0.00478408 time 3.870167s, mse: 11.01632500
|
| 575 |
+
[2026-01-08 19:12:58 root] (train_utils.py 185): INFO layer 30 lwc lac iter 2, lr 0.00452302 time 3.869613s, mse: 9.27882481
|
| 576 |
+
[2026-01-08 19:13:02 root] (train_utils.py 185): INFO layer 30 lwc lac iter 3, lr 0.00417365 time 3.875138s, mse: 8.87542439
|
| 577 |
+
[2026-01-08 19:13:06 root] (train_utils.py 185): INFO layer 30 lwc lac iter 4, lr 0.00375125 time 3.935740s, mse: 8.75351048
|
| 578 |
+
[2026-01-08 19:13:09 root] (train_utils.py 185): INFO layer 30 lwc lac iter 5, lr 0.00327427 time 3.915037s, mse: 8.65880680
|
| 579 |
+
[2026-01-08 19:13:13 root] (train_utils.py 185): INFO layer 30 lwc lac iter 6, lr 0.00276356 time 3.883668s, mse: 8.60634327
|
| 580 |
+
[2026-01-08 19:13:17 root] (train_utils.py 185): INFO layer 30 lwc lac iter 7, lr 0.00224144 time 3.902720s, mse: 8.53597736
|
| 581 |
+
[2026-01-08 19:13:21 root] (train_utils.py 185): INFO layer 30 lwc lac iter 8, lr 0.00173073 time 3.875391s, mse: 8.50352001
|
| 582 |
+
[2026-01-08 19:13:25 root] (train_utils.py 185): INFO layer 30 lwc lac iter 9, lr 0.00125375 time 3.871288s, mse: 8.44190311
|
| 583 |
+
[2026-01-08 19:13:29 root] (train_utils.py 185): INFO layer 30 lwc lac iter 10, lr 0.00083135 time 3.872681s, mse: 8.40491486
|
| 584 |
+
[2026-01-08 19:13:33 root] (train_utils.py 185): INFO layer 30 lwc lac iter 11, lr 0.00048198 time 3.878355s, mse: 8.38511753
|
| 585 |
+
[2026-01-08 19:13:37 root] (train_utils.py 185): INFO layer 30 lwc lac iter 12, lr 0.00022092 time 3.878202s, mse: 8.35692787
|
| 586 |
+
[2026-01-08 19:13:40 root] (train_utils.py 185): INFO layer 30 lwc lac iter 13, lr 0.00005958 time 3.878534s, mse: 8.35674667
|
| 587 |
+
[2026-01-08 19:13:44 root] (train_utils.py 185): INFO layer 30 lwc lac iter 14, lr 0.00000500 time 3.874162s, mse: 8.34408569
|
| 588 |
+
[2026-01-08 19:13:45 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 589 |
+
[2026-01-08 19:13:45 root] (train_utils.py 108): INFO ========= Layer 31 =========
|
| 590 |
+
[2026-01-08 19:13:53 root] (train_utils.py 185): INFO layer 31 lwc lac iter 0, lr 0.00494542 time 5.028024s, mse: 20.78250885
|
| 591 |
+
[2026-01-08 19:13:57 root] (train_utils.py 185): INFO layer 31 lwc lac iter 1, lr 0.00478408 time 3.877883s, mse: 14.37235165
|
| 592 |
+
[2026-01-08 19:14:01 root] (train_utils.py 185): INFO layer 31 lwc lac iter 2, lr 0.00452302 time 3.880755s, mse: 12.13233566
|
| 593 |
+
[2026-01-08 19:14:04 root] (train_utils.py 185): INFO layer 31 lwc lac iter 3, lr 0.00417365 time 3.879653s, mse: 11.62570667
|
| 594 |
+
[2026-01-08 19:14:08 root] (train_utils.py 185): INFO layer 31 lwc lac iter 4, lr 0.00375125 time 3.874181s, mse: 11.51362991
|
| 595 |
+
[2026-01-08 19:14:12 root] (train_utils.py 185): INFO layer 31 lwc lac iter 5, lr 0.00327427 time 3.873015s, mse: 11.42485142
|
| 596 |
+
[2026-01-08 19:14:16 root] (train_utils.py 185): INFO layer 31 lwc lac iter 6, lr 0.00276356 time 3.880170s, mse: 11.33607769
|
| 597 |
+
[2026-01-08 19:14:20 root] (train_utils.py 185): INFO layer 31 lwc lac iter 7, lr 0.00224144 time 3.871797s, mse: 11.27843571
|
| 598 |
+
[2026-01-08 19:14:24 root] (train_utils.py 185): INFO layer 31 lwc lac iter 8, lr 0.00173073 time 3.874252s, mse: 11.22037888
|
| 599 |
+
[2026-01-08 19:14:28 root] (train_utils.py 185): INFO layer 31 lwc lac iter 9, lr 0.00125375 time 3.875672s, mse: 11.15839195
|
| 600 |
+
[2026-01-08 19:14:32 root] (train_utils.py 185): INFO layer 31 lwc lac iter 10, lr 0.00083135 time 3.878424s, mse: 11.12734127
|
| 601 |
+
[2026-01-08 19:14:35 root] (train_utils.py 185): INFO layer 31 lwc lac iter 11, lr 0.00048198 time 3.873094s, mse: 11.08810806
|
| 602 |
+
[2026-01-08 19:14:39 root] (train_utils.py 185): INFO layer 31 lwc lac iter 12, lr 0.00022092 time 3.871928s, mse: 11.05513668
|
| 603 |
+
[2026-01-08 19:14:43 root] (train_utils.py 185): INFO layer 31 lwc lac iter 13, lr 0.00005958 time 3.875564s, mse: 11.03436947
|
| 604 |
+
[2026-01-08 19:14:47 root] (train_utils.py 185): INFO layer 31 lwc lac iter 14, lr 0.00000500 time 3.871252s, mse: 11.01393795
|
| 605 |
+
[2026-01-08 19:14:48 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 606 |
+
[2026-01-08 19:14:48 root] (train_utils.py 108): INFO ========= Layer 32 =========
|
| 607 |
+
[2026-01-08 19:14:56 root] (train_utils.py 185): INFO layer 32 lwc lac iter 0, lr 0.00494542 time 5.154165s, mse: 28.37956429
|
| 608 |
+
[2026-01-08 19:15:00 root] (train_utils.py 185): INFO layer 32 lwc lac iter 1, lr 0.00478408 time 3.881288s, mse: 19.76789856
|
| 609 |
+
[2026-01-08 19:15:04 root] (train_utils.py 185): INFO layer 32 lwc lac iter 2, lr 0.00452302 time 3.875616s, mse: 16.61169624
|
| 610 |
+
[2026-01-08 19:15:08 root] (train_utils.py 185): INFO layer 32 lwc lac iter 3, lr 0.00417365 time 3.874239s, mse: 15.88970184
|
| 611 |
+
[2026-01-08 19:15:12 root] (train_utils.py 185): INFO layer 32 lwc lac iter 4, lr 0.00375125 time 3.879297s, mse: 15.74769402
|
| 612 |
+
[2026-01-08 19:15:16 root] (train_utils.py 185): INFO layer 32 lwc lac iter 5, lr 0.00327427 time 3.874795s, mse: 15.61922455
|
| 613 |
+
[2026-01-08 19:15:19 root] (train_utils.py 185): INFO layer 32 lwc lac iter 6, lr 0.00276356 time 3.876008s, mse: 15.51004982
|
| 614 |
+
[2026-01-08 19:15:23 root] (train_utils.py 185): INFO layer 32 lwc lac iter 7, lr 0.00224144 time 3.878464s, mse: 15.42904854
|
| 615 |
+
[2026-01-08 19:15:27 root] (train_utils.py 185): INFO layer 32 lwc lac iter 8, lr 0.00173073 time 3.874979s, mse: 15.34880447
|
| 616 |
+
[2026-01-08 19:15:31 root] (train_utils.py 185): INFO layer 32 lwc lac iter 9, lr 0.00125375 time 3.878422s, mse: 15.27359772
|
| 617 |
+
[2026-01-08 19:15:35 root] (train_utils.py 185): INFO layer 32 lwc lac iter 10, lr 0.00083135 time 3.886444s, mse: 15.21441174
|
| 618 |
+
[2026-01-08 19:15:39 root] (train_utils.py 185): INFO layer 32 lwc lac iter 11, lr 0.00048198 time 3.877920s, mse: 15.16252708
|
| 619 |
+
[2026-01-08 19:15:43 root] (train_utils.py 185): INFO layer 32 lwc lac iter 12, lr 0.00022092 time 3.875979s, mse: 15.10843849
|
| 620 |
+
[2026-01-08 19:15:47 root] (train_utils.py 185): INFO layer 32 lwc lac iter 13, lr 0.00005958 time 3.876947s, mse: 15.08382893
|
| 621 |
+
[2026-01-08 19:15:50 root] (train_utils.py 185): INFO layer 32 lwc lac iter 14, lr 0.00000500 time 3.878299s, mse: 15.06546974
|
| 622 |
+
[2026-01-08 19:15:51 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 623 |
+
[2026-01-08 19:15:52 root] (train_utils.py 108): INFO ========= Layer 33 =========
|
| 624 |
+
[2026-01-08 19:15:59 root] (train_utils.py 185): INFO layer 33 lwc lac iter 0, lr 0.00494542 time 5.236542s, mse: 41.54327011
|
| 625 |
+
[2026-01-08 19:16:03 root] (train_utils.py 185): INFO layer 33 lwc lac iter 1, lr 0.00478408 time 3.938019s, mse: 27.93664551
|
| 626 |
+
[2026-01-08 19:16:07 root] (train_utils.py 185): INFO layer 33 lwc lac iter 2, lr 0.00452302 time 3.878417s, mse: 23.32941628
|
| 627 |
+
[2026-01-08 19:16:11 root] (train_utils.py 185): INFO layer 33 lwc lac iter 3, lr 0.00417365 time 3.872800s, mse: 22.34293175
|
| 628 |
+
[2026-01-08 19:16:15 root] (train_utils.py 185): INFO layer 33 lwc lac iter 4, lr 0.00375125 time 3.874553s, mse: 22.07669640
|
| 629 |
+
[2026-01-08 19:16:19 root] (train_utils.py 185): INFO layer 33 lwc lac iter 5, lr 0.00327427 time 3.874539s, mse: 21.87960243
|
| 630 |
+
[2026-01-08 19:16:23 root] (train_utils.py 185): INFO layer 33 lwc lac iter 6, lr 0.00276356 time 3.869958s, mse: 21.73635674
|
| 631 |
+
[2026-01-08 19:16:27 root] (train_utils.py 185): INFO layer 33 lwc lac iter 7, lr 0.00224144 time 3.875664s, mse: 21.58724403
|
| 632 |
+
[2026-01-08 19:16:30 root] (train_utils.py 185): INFO layer 33 lwc lac iter 8, lr 0.00173073 time 3.880439s, mse: 21.46766853
|
| 633 |
+
[2026-01-08 19:16:34 root] (train_utils.py 185): INFO layer 33 lwc lac iter 9, lr 0.00125375 time 3.872697s, mse: 21.36098099
|
| 634 |
+
[2026-01-08 19:16:38 root] (train_utils.py 185): INFO layer 33 lwc lac iter 10, lr 0.00083135 time 3.876025s, mse: 21.27636719
|
| 635 |
+
[2026-01-08 19:16:42 root] (train_utils.py 185): INFO layer 33 lwc lac iter 11, lr 0.00048198 time 3.871703s, mse: 21.16030693
|
| 636 |
+
[2026-01-08 19:16:46 root] (train_utils.py 185): INFO layer 33 lwc lac iter 12, lr 0.00022092 time 3.870745s, mse: 21.07536125
|
| 637 |
+
[2026-01-08 19:16:50 root] (train_utils.py 185): INFO layer 33 lwc lac iter 13, lr 0.00005958 time 3.876155s, mse: 20.99114990
|
| 638 |
+
[2026-01-08 19:16:54 root] (train_utils.py 185): INFO layer 33 lwc lac iter 14, lr 0.00000500 time 3.873363s, mse: 20.95961761
|
| 639 |
+
[2026-01-08 19:16:54 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 640 |
+
[2026-01-08 19:16:55 root] (train_utils.py 108): INFO ========= Layer 34 =========
|
| 641 |
+
[2026-01-08 19:17:03 root] (train_utils.py 185): INFO layer 34 lwc lac iter 0, lr 0.00494542 time 4.991384s, mse: 64.93594360
|
| 642 |
+
[2026-01-08 19:17:07 root] (train_utils.py 185): INFO layer 34 lwc lac iter 1, lr 0.00478408 time 3.943139s, mse: 40.86461258
|
| 643 |
+
[2026-01-08 19:17:10 root] (train_utils.py 185): INFO layer 34 lwc lac iter 2, lr 0.00452302 time 3.868030s, mse: 33.65349960
|
| 644 |
+
[2026-01-08 19:17:14 root] (train_utils.py 185): INFO layer 34 lwc lac iter 3, lr 0.00417365 time 3.868735s, mse: 31.96302605
|
| 645 |
+
[2026-01-08 19:17:18 root] (train_utils.py 185): INFO layer 34 lwc lac iter 4, lr 0.00375125 time 3.873104s, mse: 31.66926384
|
| 646 |
+
[2026-01-08 19:17:22 root] (train_utils.py 185): INFO layer 34 lwc lac iter 5, lr 0.00327427 time 3.873229s, mse: 31.07656479
|
| 647 |
+
[2026-01-08 19:17:26 root] (train_utils.py 185): INFO layer 34 lwc lac iter 6, lr 0.00276356 time 3.873526s, mse: 30.91048813
|
| 648 |
+
[2026-01-08 19:17:30 root] (train_utils.py 185): INFO layer 34 lwc lac iter 7, lr 0.00224144 time 3.875315s, mse: 30.05115700
|
| 649 |
+
[2026-01-08 19:17:34 root] (train_utils.py 185): INFO layer 34 lwc lac iter 8, lr 0.00173073 time 3.879331s, mse: 29.89023590
|
| 650 |
+
[2026-01-08 19:17:38 root] (train_utils.py 185): INFO layer 34 lwc lac iter 9, lr 0.00125375 time 3.873674s, mse: 30.35319901
|
| 651 |
+
[2026-01-08 19:17:41 root] (train_utils.py 185): INFO layer 34 lwc lac iter 10, lr 0.00083135 time 3.871441s, mse: 29.46559715
|
| 652 |
+
[2026-01-08 19:17:45 root] (train_utils.py 185): INFO layer 34 lwc lac iter 11, lr 0.00048198 time 3.869557s, mse: 29.05239487
|
| 653 |
+
[2026-01-08 19:17:49 root] (train_utils.py 185): INFO layer 34 lwc lac iter 12, lr 0.00022092 time 3.872165s, mse: 28.86521339
|
| 654 |
+
[2026-01-08 19:17:53 root] (train_utils.py 185): INFO layer 34 lwc lac iter 13, lr 0.00005958 time 3.871504s, mse: 28.74409676
|
| 655 |
+
[2026-01-08 19:17:57 root] (train_utils.py 185): INFO layer 34 lwc lac iter 14, lr 0.00000500 time 3.877051s, mse: 28.70412636
|
| 656 |
+
[2026-01-08 19:17:57 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 657 |
+
[2026-01-08 19:17:58 root] (train_utils.py 108): INFO ========= Layer 35 =========
|
| 658 |
+
[2026-01-08 19:18:06 root] (train_utils.py 185): INFO layer 35 lwc lac iter 0, lr 0.00494542 time 5.117816s, mse: 108.25781250
|
| 659 |
+
[2026-01-08 19:18:10 root] (train_utils.py 185): INFO layer 35 lwc lac iter 1, lr 0.00478408 time 3.871820s, mse: 38.04971313
|
| 660 |
+
[2026-01-08 19:18:13 root] (train_utils.py 185): INFO layer 35 lwc lac iter 2, lr 0.00452302 time 3.875288s, mse: 31.63025665
|
| 661 |
+
[2026-01-08 19:18:17 root] (train_utils.py 185): INFO layer 35 lwc lac iter 3, lr 0.00417365 time 3.870218s, mse: 29.21376991
|
| 662 |
+
[2026-01-08 19:18:21 root] (train_utils.py 185): INFO layer 35 lwc lac iter 4, lr 0.00375125 time 3.873308s, mse: 28.19089508
|
| 663 |
+
[2026-01-08 19:18:25 root] (train_utils.py 185): INFO layer 35 lwc lac iter 5, lr 0.00327427 time 3.872198s, mse: 28.40728760
|
| 664 |
+
[2026-01-08 19:18:29 root] (train_utils.py 185): INFO layer 35 lwc lac iter 6, lr 0.00276356 time 3.873228s, mse: 27.74842644
|
| 665 |
+
[2026-01-08 19:18:33 root] (train_utils.py 185): INFO layer 35 lwc lac iter 7, lr 0.00224144 time 3.872646s, mse: 27.13273811
|
| 666 |
+
[2026-01-08 19:18:37 root] (train_utils.py 185): INFO layer 35 lwc lac iter 8, lr 0.00173073 time 3.887236s, mse: 26.53238487
|
| 667 |
+
[2026-01-08 19:18:41 root] (train_utils.py 185): INFO layer 35 lwc lac iter 9, lr 0.00125375 time 3.929309s, mse: 26.14052200
|
| 668 |
+
[2026-01-08 19:18:44 root] (train_utils.py 185): INFO layer 35 lwc lac iter 10, lr 0.00083135 time 3.869573s, mse: 25.63203621
|
| 669 |
+
[2026-01-08 19:18:48 root] (train_utils.py 185): INFO layer 35 lwc lac iter 11, lr 0.00048198 time 3.877343s, mse: 25.35079384
|
| 670 |
+
[2026-01-08 19:18:52 root] (train_utils.py 185): INFO layer 35 lwc lac iter 12, lr 0.00022092 time 3.877298s, mse: 25.21109390
|
| 671 |
+
[2026-01-08 19:18:56 root] (train_utils.py 185): INFO layer 35 lwc lac iter 13, lr 0.00005958 time 3.884227s, mse: 24.95710945
|
| 672 |
+
[2026-01-08 19:19:00 root] (train_utils.py 185): INFO layer 35 lwc lac iter 14, lr 0.00000500 time 3.953963s, mse: 24.85692596
|
| 673 |
+
[2026-01-08 19:19:01 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 674 |
+
[2026-01-08 19:19:38 root] (main.py 39): INFO Finished reparameterize model.
|
| 675 |
+
[2026-01-08 19:20:04 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.27 -> 0.25 GB (-0.02 GB)
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_195354.txt
ADDED
|
@@ -0,0 +1,680 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-08 19:53:54 root] (args_utils.py 159): INFO Arguments:
|
| 2 |
+
[2026-01-08 19:53:54 root] (args_utils.py 160): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': True,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': 128,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-08 19:53:54 root] (args_utils.py 161): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-08 19:53:55 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-08 19:54:11 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-08 19:54:16 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 62 |
+
[2026-01-08 19:54:19 root] (train_utils.py 108): INFO ========= Layer 0 =========
|
| 63 |
+
[2026-01-08 19:54:27 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 5.696289s, mse: 0.01574295
|
| 64 |
+
[2026-01-08 19:54:31 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.970096s, mse: 0.01115426
|
| 65 |
+
[2026-01-08 19:54:35 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.904277s, mse: 0.00938093
|
| 66 |
+
[2026-01-08 19:54:39 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.879331s, mse: 0.00881439
|
| 67 |
+
[2026-01-08 19:54:43 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.879196s, mse: 0.00857142
|
| 68 |
+
[2026-01-08 19:54:47 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.875969s, mse: 0.00849318
|
| 69 |
+
[2026-01-08 19:54:51 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.881118s, mse: 0.00832680
|
| 70 |
+
[2026-01-08 19:54:54 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.879780s, mse: 0.00828776
|
| 71 |
+
[2026-01-08 19:54:58 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 3.879514s, mse: 0.00818714
|
| 72 |
+
[2026-01-08 19:55:02 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.959016s, mse: 0.00813103
|
| 73 |
+
[2026-01-08 19:55:06 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 4.002184s, mse: 0.00808381
|
| 74 |
+
[2026-01-08 19:55:10 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.974104s, mse: 0.00804329
|
| 75 |
+
[2026-01-08 19:55:14 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.941929s, mse: 0.00799941
|
| 76 |
+
[2026-01-08 19:55:18 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.913412s, mse: 0.00795571
|
| 77 |
+
[2026-01-08 19:55:22 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.888769s, mse: 0.00794016
|
| 78 |
+
[2026-01-08 19:55:22 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 79 |
+
[2026-01-08 19:55:23 root] (train_utils.py 108): INFO ========= Layer 1 =========
|
| 80 |
+
[2026-01-08 19:55:30 root] (train_utils.py 185): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 4.934166s, mse: 0.00892038
|
| 81 |
+
[2026-01-08 19:55:34 root] (train_utils.py 185): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 3.895962s, mse: 0.00479663
|
| 82 |
+
[2026-01-08 19:55:38 root] (train_utils.py 185): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.882093s, mse: 0.00384854
|
| 83 |
+
[2026-01-08 19:55:42 root] (train_utils.py 185): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 3.879302s, mse: 0.00355465
|
| 84 |
+
[2026-01-08 19:55:46 root] (train_utils.py 185): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 3.870608s, mse: 0.00343135
|
| 85 |
+
[2026-01-08 19:55:50 root] (train_utils.py 185): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.872881s, mse: 0.00337971
|
| 86 |
+
[2026-01-08 19:55:54 root] (train_utils.py 185): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.874244s, mse: 0.00336636
|
| 87 |
+
[2026-01-08 19:55:58 root] (train_utils.py 185): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 3.877368s, mse: 0.00329515
|
| 88 |
+
[2026-01-08 19:56:01 root] (train_utils.py 185): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 3.871886s, mse: 0.00326379
|
| 89 |
+
[2026-01-08 19:56:05 root] (train_utils.py 185): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 3.888601s, mse: 0.00321724
|
| 90 |
+
[2026-01-08 19:56:09 root] (train_utils.py 185): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 3.873849s, mse: 0.00316591
|
| 91 |
+
[2026-01-08 19:56:13 root] (train_utils.py 185): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 3.885014s, mse: 0.00313276
|
| 92 |
+
[2026-01-08 19:56:17 root] (train_utils.py 185): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 3.890614s, mse: 0.00310469
|
| 93 |
+
[2026-01-08 19:56:21 root] (train_utils.py 185): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.876605s, mse: 0.00308243
|
| 94 |
+
[2026-01-08 19:56:25 root] (train_utils.py 185): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 3.877022s, mse: 0.00306749
|
| 95 |
+
[2026-01-08 19:56:25 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 96 |
+
[2026-01-08 19:56:26 root] (train_utils.py 108): INFO ========= Layer 2 =========
|
| 97 |
+
[2026-01-08 19:56:34 root] (train_utils.py 185): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 5.188274s, mse: 0.01750460
|
| 98 |
+
[2026-01-08 19:56:38 root] (train_utils.py 185): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 3.876876s, mse: 0.00626545
|
| 99 |
+
[2026-01-08 19:56:42 root] (train_utils.py 185): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 3.873263s, mse: 0.00494380
|
| 100 |
+
[2026-01-08 19:56:46 root] (train_utils.py 185): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 3.874986s, mse: 0.00453308
|
| 101 |
+
[2026-01-08 19:56:49 root] (train_utils.py 185): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 3.879791s, mse: 0.00439964
|
| 102 |
+
[2026-01-08 19:56:53 root] (train_utils.py 185): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 3.872402s, mse: 0.00429795
|
| 103 |
+
[2026-01-08 19:56:57 root] (train_utils.py 185): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 3.880752s, mse: 0.00425246
|
| 104 |
+
[2026-01-08 19:57:01 root] (train_utils.py 185): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 3.871940s, mse: 0.00420888
|
| 105 |
+
[2026-01-08 19:57:05 root] (train_utils.py 185): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 3.878927s, mse: 0.00415287
|
| 106 |
+
[2026-01-08 19:57:09 root] (train_utils.py 185): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.881111s, mse: 0.00411024
|
| 107 |
+
[2026-01-08 19:57:13 root] (train_utils.py 185): INFO layer 2 lwc lac iter 10, lr 0.00083135 time 3.875601s, mse: 0.00407672
|
| 108 |
+
[2026-01-08 19:57:17 root] (train_utils.py 185): INFO layer 2 lwc lac iter 11, lr 0.00048198 time 3.878016s, mse: 0.00404750
|
| 109 |
+
[2026-01-08 19:57:20 root] (train_utils.py 185): INFO layer 2 lwc lac iter 12, lr 0.00022092 time 3.881199s, mse: 0.00401742
|
| 110 |
+
[2026-01-08 19:57:24 root] (train_utils.py 185): INFO layer 2 lwc lac iter 13, lr 0.00005958 time 3.882928s, mse: 0.00398090
|
| 111 |
+
[2026-01-08 19:57:28 root] (train_utils.py 185): INFO layer 2 lwc lac iter 14, lr 0.00000500 time 3.878617s, mse: 0.00397130
|
| 112 |
+
[2026-01-08 19:57:29 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 113 |
+
[2026-01-08 19:57:29 root] (train_utils.py 108): INFO ========= Layer 3 =========
|
| 114 |
+
[2026-01-08 19:57:37 root] (train_utils.py 185): INFO layer 3 lwc lac iter 0, lr 0.00494542 time 4.904515s, mse: 0.02308414
|
| 115 |
+
[2026-01-08 19:57:41 root] (train_utils.py 185): INFO layer 3 lwc lac iter 1, lr 0.00478408 time 3.959069s, mse: 0.01333557
|
| 116 |
+
[2026-01-08 19:57:45 root] (train_utils.py 185): INFO layer 3 lwc lac iter 2, lr 0.00452302 time 3.885673s, mse: 0.01099337
|
| 117 |
+
[2026-01-08 19:57:49 root] (train_utils.py 185): INFO layer 3 lwc lac iter 3, lr 0.00417365 time 3.889621s, mse: 0.01028412
|
| 118 |
+
[2026-01-08 19:57:52 root] (train_utils.py 185): INFO layer 3 lwc lac iter 4, lr 0.00375125 time 3.866672s, mse: 0.01000082
|
| 119 |
+
[2026-01-08 19:57:56 root] (train_utils.py 185): INFO layer 3 lwc lac iter 5, lr 0.00327427 time 3.871443s, mse: 0.00980410
|
| 120 |
+
[2026-01-08 19:58:00 root] (train_utils.py 185): INFO layer 3 lwc lac iter 6, lr 0.00276356 time 3.865910s, mse: 0.00969286
|
| 121 |
+
[2026-01-08 19:58:04 root] (train_utils.py 185): INFO layer 3 lwc lac iter 7, lr 0.00224144 time 3.871522s, mse: 0.00956387
|
| 122 |
+
[2026-01-08 19:58:08 root] (train_utils.py 185): INFO layer 3 lwc lac iter 8, lr 0.00173073 time 3.869807s, mse: 0.00946260
|
| 123 |
+
[2026-01-08 19:58:12 root] (train_utils.py 185): INFO layer 3 lwc lac iter 9, lr 0.00125375 time 3.870852s, mse: 0.00937346
|
| 124 |
+
[2026-01-08 19:58:16 root] (train_utils.py 185): INFO layer 3 lwc lac iter 10, lr 0.00083135 time 3.881036s, mse: 0.00926330
|
| 125 |
+
[2026-01-08 19:58:20 root] (train_utils.py 185): INFO layer 3 lwc lac iter 11, lr 0.00048198 time 3.865621s, mse: 0.00916464
|
| 126 |
+
[2026-01-08 19:58:23 root] (train_utils.py 185): INFO layer 3 lwc lac iter 12, lr 0.00022092 time 3.870549s, mse: 0.00907166
|
| 127 |
+
[2026-01-08 19:58:27 root] (train_utils.py 185): INFO layer 3 lwc lac iter 13, lr 0.00005958 time 3.869347s, mse: 0.00904066
|
| 128 |
+
[2026-01-08 19:58:31 root] (train_utils.py 185): INFO layer 3 lwc lac iter 14, lr 0.00000500 time 3.869239s, mse: 0.00900416
|
| 129 |
+
[2026-01-08 19:58:32 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 130 |
+
[2026-01-08 19:58:33 root] (train_utils.py 108): INFO ========= Layer 4 =========
|
| 131 |
+
[2026-01-08 19:58:41 root] (train_utils.py 185): INFO layer 4 lwc lac iter 0, lr 0.00494542 time 5.550916s, mse: 0.06576648
|
| 132 |
+
[2026-01-08 19:58:45 root] (train_utils.py 185): INFO layer 4 lwc lac iter 1, lr 0.00478408 time 3.959090s, mse: 0.03741666
|
| 133 |
+
[2026-01-08 19:58:49 root] (train_utils.py 185): INFO layer 4 lwc lac iter 2, lr 0.00452302 time 3.878123s, mse: 0.03053248
|
| 134 |
+
[2026-01-08 19:58:53 root] (train_utils.py 185): INFO layer 4 lwc lac iter 3, lr 0.00417365 time 3.880192s, mse: 0.02855516
|
| 135 |
+
[2026-01-08 19:58:57 root] (train_utils.py 185): INFO layer 4 lwc lac iter 4, lr 0.00375125 time 3.880899s, mse: 0.02790034
|
| 136 |
+
[2026-01-08 19:59:00 root] (train_utils.py 185): INFO layer 4 lwc lac iter 5, lr 0.00327427 time 3.881101s, mse: 0.02746365
|
| 137 |
+
[2026-01-08 19:59:04 root] (train_utils.py 185): INFO layer 4 lwc lac iter 6, lr 0.00276356 time 3.882933s, mse: 0.02716962
|
| 138 |
+
[2026-01-08 19:59:08 root] (train_utils.py 185): INFO layer 4 lwc lac iter 7, lr 0.00224144 time 3.879195s, mse: 0.02687641
|
| 139 |
+
[2026-01-08 19:59:12 root] (train_utils.py 185): INFO layer 4 lwc lac iter 8, lr 0.00173073 time 3.876591s, mse: 0.02662238
|
| 140 |
+
[2026-01-08 19:59:16 root] (train_utils.py 185): INFO layer 4 lwc lac iter 9, lr 0.00125375 time 3.891409s, mse: 0.02643147
|
| 141 |
+
[2026-01-08 19:59:20 root] (train_utils.py 185): INFO layer 4 lwc lac iter 10, lr 0.00083135 time 3.891485s, mse: 0.02624781
|
| 142 |
+
[2026-01-08 19:59:24 root] (train_utils.py 185): INFO layer 4 lwc lac iter 11, lr 0.00048198 time 3.898774s, mse: 0.02604026
|
| 143 |
+
[2026-01-08 19:59:28 root] (train_utils.py 185): INFO layer 4 lwc lac iter 12, lr 0.00022092 time 3.888591s, mse: 0.02585863
|
| 144 |
+
[2026-01-08 19:59:32 root] (train_utils.py 185): INFO layer 4 lwc lac iter 13, lr 0.00005958 time 3.887119s, mse: 0.02578292
|
| 145 |
+
[2026-01-08 19:59:35 root] (train_utils.py 185): INFO layer 4 lwc lac iter 14, lr 0.00000500 time 3.895279s, mse: 0.02572995
|
| 146 |
+
[2026-01-08 19:59:36 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 147 |
+
[2026-01-08 19:59:37 root] (train_utils.py 108): INFO ========= Layer 5 =========
|
| 148 |
+
[2026-01-08 19:59:45 root] (train_utils.py 185): INFO layer 5 lwc lac iter 0, lr 0.00494542 time 5.898441s, mse: 0.13743916
|
| 149 |
+
[2026-01-08 19:59:49 root] (train_utils.py 185): INFO layer 5 lwc lac iter 1, lr 0.00478408 time 3.961223s, mse: 0.08057592
|
| 150 |
+
[2026-01-08 19:59:53 root] (train_utils.py 185): INFO layer 5 lwc lac iter 2, lr 0.00452302 time 3.889991s, mse: 0.06617787
|
| 151 |
+
[2026-01-08 19:59:57 root] (train_utils.py 185): INFO layer 5 lwc lac iter 3, lr 0.00417365 time 3.895876s, mse: 0.06287611
|
| 152 |
+
[2026-01-08 20:00:01 root] (train_utils.py 185): INFO layer 5 lwc lac iter 4, lr 0.00375125 time 3.896747s, mse: 0.06213523
|
| 153 |
+
[2026-01-08 20:00:05 root] (train_utils.py 185): INFO layer 5 lwc lac iter 5, lr 0.00327427 time 3.888745s, mse: 0.06160403
|
| 154 |
+
[2026-01-08 20:00:09 root] (train_utils.py 185): INFO layer 5 lwc lac iter 6, lr 0.00276356 time 3.896078s, mse: 0.06119698
|
| 155 |
+
[2026-01-08 20:00:13 root] (train_utils.py 185): INFO layer 5 lwc lac iter 7, lr 0.00224144 time 3.890085s, mse: 0.06094177
|
| 156 |
+
[2026-01-08 20:00:16 root] (train_utils.py 185): INFO layer 5 lwc lac iter 8, lr 0.00173073 time 3.884274s, mse: 0.06060794
|
| 157 |
+
[2026-01-08 20:00:20 root] (train_utils.py 185): INFO layer 5 lwc lac iter 9, lr 0.00125375 time 3.900461s, mse: 0.06020888
|
| 158 |
+
[2026-01-08 20:00:24 root] (train_utils.py 185): INFO layer 5 lwc lac iter 10, lr 0.00083135 time 3.890183s, mse: 0.05995716
|
| 159 |
+
[2026-01-08 20:00:28 root] (train_utils.py 185): INFO layer 5 lwc lac iter 11, lr 0.00048198 time 3.900670s, mse: 0.05978661
|
| 160 |
+
[2026-01-08 20:00:32 root] (train_utils.py 185): INFO layer 5 lwc lac iter 12, lr 0.00022092 time 3.893111s, mse: 0.05955682
|
| 161 |
+
[2026-01-08 20:00:36 root] (train_utils.py 185): INFO layer 5 lwc lac iter 13, lr 0.00005958 time 3.897518s, mse: 0.05938030
|
| 162 |
+
[2026-01-08 20:00:40 root] (train_utils.py 185): INFO layer 5 lwc lac iter 14, lr 0.00000500 time 3.884297s, mse: 0.05934311
|
| 163 |
+
[2026-01-08 20:00:40 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 164 |
+
[2026-01-08 20:00:53 root] (train_utils.py 108): INFO ========= Layer 6 =========
|
| 165 |
+
[2026-01-08 20:01:02 root] (train_utils.py 185): INFO layer 6 lwc lac iter 0, lr 0.00494542 time 5.689345s, mse: 1.86451793
|
| 166 |
+
[2026-01-08 20:01:06 root] (train_utils.py 185): INFO layer 6 lwc lac iter 1, lr 0.00478408 time 3.927325s, mse: 0.35658583
|
| 167 |
+
[2026-01-08 20:01:10 root] (train_utils.py 185): INFO layer 6 lwc lac iter 2, lr 0.00452302 time 3.878829s, mse: 0.32737118
|
| 168 |
+
[2026-01-08 20:01:14 root] (train_utils.py 185): INFO layer 6 lwc lac iter 3, lr 0.00417365 time 3.883708s, mse: 0.28929594
|
| 169 |
+
[2026-01-08 20:01:18 root] (train_utils.py 185): INFO layer 6 lwc lac iter 4, lr 0.00375125 time 3.987442s, mse: 0.24128482
|
| 170 |
+
[2026-01-08 20:01:21 root] (train_utils.py 185): INFO layer 6 lwc lac iter 5, lr 0.00327427 time 3.916989s, mse: 0.21027605
|
| 171 |
+
[2026-01-08 20:01:25 root] (train_utils.py 185): INFO layer 6 lwc lac iter 6, lr 0.00276356 time 3.879291s, mse: 0.25483868
|
| 172 |
+
[2026-01-08 20:01:29 root] (train_utils.py 185): INFO layer 6 lwc lac iter 7, lr 0.00224144 time 3.888053s, mse: 0.23871142
|
| 173 |
+
[2026-01-08 20:01:33 root] (train_utils.py 185): INFO layer 6 lwc lac iter 8, lr 0.00173073 time 3.881018s, mse: 0.21885920
|
| 174 |
+
[2026-01-08 20:01:37 root] (train_utils.py 185): INFO layer 6 lwc lac iter 9, lr 0.00125375 time 3.884074s, mse: 0.20672695
|
| 175 |
+
[2026-01-08 20:01:41 root] (train_utils.py 185): INFO layer 6 lwc lac iter 10, lr 0.00083135 time 3.881378s, mse: 0.20202750
|
| 176 |
+
[2026-01-08 20:01:45 root] (train_utils.py 185): INFO layer 6 lwc lac iter 11, lr 0.00048198 time 3.877917s, mse: 0.17932597
|
| 177 |
+
[2026-01-08 20:01:49 root] (train_utils.py 185): INFO layer 6 lwc lac iter 12, lr 0.00022092 time 3.878248s, mse: 0.20257902
|
| 178 |
+
[2026-01-08 20:01:53 root] (train_utils.py 185): INFO layer 6 lwc lac iter 13, lr 0.00005958 time 3.896357s, mse: 0.20667967
|
| 179 |
+
[2026-01-08 20:01:56 root] (train_utils.py 185): INFO layer 6 lwc lac iter 14, lr 0.00000500 time 3.885994s, mse: 0.16777667
|
| 180 |
+
[2026-01-08 20:01:57 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 181 |
+
[2026-01-08 20:01:57 root] (train_utils.py 108): INFO ========= Layer 7 =========
|
| 182 |
+
[2026-01-08 20:02:05 root] (train_utils.py 185): INFO layer 7 lwc lac iter 0, lr 0.00494542 time 5.306338s, mse: 0.23462753
|
| 183 |
+
[2026-01-08 20:02:09 root] (train_utils.py 185): INFO layer 7 lwc lac iter 1, lr 0.00478408 time 3.972694s, mse: 0.14976017
|
| 184 |
+
[2026-01-08 20:02:13 root] (train_utils.py 185): INFO layer 7 lwc lac iter 2, lr 0.00452302 time 3.913413s, mse: 0.12312289
|
| 185 |
+
[2026-01-08 20:02:17 root] (train_utils.py 185): INFO layer 7 lwc lac iter 3, lr 0.00417365 time 3.904199s, mse: 0.11779824
|
| 186 |
+
[2026-01-08 20:02:21 root] (train_utils.py 185): INFO layer 7 lwc lac iter 4, lr 0.00375125 time 3.902115s, mse: 0.11621600
|
| 187 |
+
[2026-01-08 20:02:25 root] (train_utils.py 185): INFO layer 7 lwc lac iter 5, lr 0.00327427 time 3.879436s, mse: 0.11538153
|
| 188 |
+
[2026-01-08 20:02:29 root] (train_utils.py 185): INFO layer 7 lwc lac iter 6, lr 0.00276356 time 3.873058s, mse: 0.11461711
|
| 189 |
+
[2026-01-08 20:02:33 root] (train_utils.py 185): INFO layer 7 lwc lac iter 7, lr 0.00224144 time 3.880082s, mse: 0.11396322
|
| 190 |
+
[2026-01-08 20:02:36 root] (train_utils.py 185): INFO layer 7 lwc lac iter 8, lr 0.00173073 time 3.880680s, mse: 0.11346199
|
| 191 |
+
[2026-01-08 20:02:40 root] (train_utils.py 185): INFO layer 7 lwc lac iter 9, lr 0.00125375 time 3.874789s, mse: 0.11303829
|
| 192 |
+
[2026-01-08 20:02:44 root] (train_utils.py 185): INFO layer 7 lwc lac iter 10, lr 0.00083135 time 3.937302s, mse: 0.11244514
|
| 193 |
+
[2026-01-08 20:02:48 root] (train_utils.py 185): INFO layer 7 lwc lac iter 11, lr 0.00048198 time 3.881629s, mse: 0.11193727
|
| 194 |
+
[2026-01-08 20:02:52 root] (train_utils.py 185): INFO layer 7 lwc lac iter 12, lr 0.00022092 time 3.877626s, mse: 0.11167257
|
| 195 |
+
[2026-01-08 20:02:56 root] (train_utils.py 185): INFO layer 7 lwc lac iter 13, lr 0.00005958 time 3.881678s, mse: 0.11139309
|
| 196 |
+
[2026-01-08 20:03:00 root] (train_utils.py 185): INFO layer 7 lwc lac iter 14, lr 0.00000500 time 3.873263s, mse: 0.11127126
|
| 197 |
+
[2026-01-08 20:03:00 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 198 |
+
[2026-01-08 20:03:01 root] (train_utils.py 108): INFO ========= Layer 8 =========
|
| 199 |
+
[2026-01-08 20:03:09 root] (train_utils.py 185): INFO layer 8 lwc lac iter 0, lr 0.00494542 time 5.057469s, mse: 0.31783378
|
| 200 |
+
[2026-01-08 20:03:13 root] (train_utils.py 185): INFO layer 8 lwc lac iter 1, lr 0.00478408 time 3.876861s, mse: 0.21154313
|
| 201 |
+
[2026-01-08 20:03:16 root] (train_utils.py 185): INFO layer 8 lwc lac iter 2, lr 0.00452302 time 3.878272s, mse: 0.17556834
|
| 202 |
+
[2026-01-08 20:03:20 root] (train_utils.py 185): INFO layer 8 lwc lac iter 3, lr 0.00417365 time 3.874820s, mse: 0.16892871
|
| 203 |
+
[2026-01-08 20:03:24 root] (train_utils.py 185): INFO layer 8 lwc lac iter 4, lr 0.00375125 time 3.878854s, mse: 0.16700211
|
| 204 |
+
[2026-01-08 20:03:28 root] (train_utils.py 185): INFO layer 8 lwc lac iter 5, lr 0.00327427 time 3.878486s, mse: 0.16594610
|
| 205 |
+
[2026-01-08 20:03:32 root] (train_utils.py 185): INFO layer 8 lwc lac iter 6, lr 0.00276356 time 3.873478s, mse: 0.16510613
|
| 206 |
+
[2026-01-08 20:03:36 root] (train_utils.py 185): INFO layer 8 lwc lac iter 7, lr 0.00224144 time 3.872782s, mse: 0.16456470
|
| 207 |
+
[2026-01-08 20:03:40 root] (train_utils.py 185): INFO layer 8 lwc lac iter 8, lr 0.00173073 time 3.875328s, mse: 0.16401851
|
| 208 |
+
[2026-01-08 20:03:44 root] (train_utils.py 185): INFO layer 8 lwc lac iter 9, lr 0.00125375 time 3.876778s, mse: 0.16352586
|
| 209 |
+
[2026-01-08 20:03:47 root] (train_utils.py 185): INFO layer 8 lwc lac iter 10, lr 0.00083135 time 3.876492s, mse: 0.16331530
|
| 210 |
+
[2026-01-08 20:03:51 root] (train_utils.py 185): INFO layer 8 lwc lac iter 11, lr 0.00048198 time 3.863941s, mse: 0.16285881
|
| 211 |
+
[2026-01-08 20:03:55 root] (train_utils.py 185): INFO layer 8 lwc lac iter 12, lr 0.00022092 time 3.864256s, mse: 0.16254890
|
| 212 |
+
[2026-01-08 20:03:59 root] (train_utils.py 185): INFO layer 8 lwc lac iter 13, lr 0.00005958 time 3.870399s, mse: 0.16240378
|
| 213 |
+
[2026-01-08 20:04:03 root] (train_utils.py 185): INFO layer 8 lwc lac iter 14, lr 0.00000500 time 3.871459s, mse: 0.16246043
|
| 214 |
+
[2026-01-08 20:04:03 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 215 |
+
[2026-01-08 20:04:04 root] (train_utils.py 108): INFO ========= Layer 9 =========
|
| 216 |
+
[2026-01-08 20:04:12 root] (train_utils.py 185): INFO layer 9 lwc lac iter 0, lr 0.00494542 time 5.024553s, mse: 0.37875688
|
| 217 |
+
[2026-01-08 20:04:16 root] (train_utils.py 185): INFO layer 9 lwc lac iter 1, lr 0.00478408 time 3.871795s, mse: 0.25363240
|
| 218 |
+
[2026-01-08 20:04:20 root] (train_utils.py 185): INFO layer 9 lwc lac iter 2, lr 0.00452302 time 3.871877s, mse: 0.21064380
|
| 219 |
+
[2026-01-08 20:04:23 root] (train_utils.py 185): INFO layer 9 lwc lac iter 3, lr 0.00417365 time 3.864901s, mse: 0.20179385
|
| 220 |
+
[2026-01-08 20:04:27 root] (train_utils.py 185): INFO layer 9 lwc lac iter 4, lr 0.00375125 time 3.868378s, mse: 0.19936548
|
| 221 |
+
[2026-01-08 20:04:31 root] (train_utils.py 185): INFO layer 9 lwc lac iter 5, lr 0.00327427 time 3.870681s, mse: 0.19817175
|
| 222 |
+
[2026-01-08 20:04:35 root] (train_utils.py 185): INFO layer 9 lwc lac iter 6, lr 0.00276356 time 3.872413s, mse: 0.19703594
|
| 223 |
+
[2026-01-08 20:04:39 root] (train_utils.py 185): INFO layer 9 lwc lac iter 7, lr 0.00224144 time 3.879371s, mse: 0.19626960
|
| 224 |
+
[2026-01-08 20:04:43 root] (train_utils.py 185): INFO layer 9 lwc lac iter 8, lr 0.00173073 time 3.875587s, mse: 0.19534998
|
| 225 |
+
[2026-01-08 20:04:47 root] (train_utils.py 185): INFO layer 9 lwc lac iter 9, lr 0.00125375 time 3.874845s, mse: 0.19473058
|
| 226 |
+
[2026-01-08 20:04:51 root] (train_utils.py 185): INFO layer 9 lwc lac iter 10, lr 0.00083135 time 3.881644s, mse: 0.19404019
|
| 227 |
+
[2026-01-08 20:04:54 root] (train_utils.py 185): INFO layer 9 lwc lac iter 11, lr 0.00048198 time 3.886687s, mse: 0.19356999
|
| 228 |
+
[2026-01-08 20:04:59 root] (train_utils.py 185): INFO layer 9 lwc lac iter 12, lr 0.00022092 time 4.398774s, mse: 0.19326007
|
| 229 |
+
[2026-01-08 20:05:03 root] (train_utils.py 185): INFO layer 9 lwc lac iter 13, lr 0.00005958 time 3.888181s, mse: 0.19282311
|
| 230 |
+
[2026-01-08 20:05:07 root] (train_utils.py 185): INFO layer 9 lwc lac iter 14, lr 0.00000500 time 3.880987s, mse: 0.19267595
|
| 231 |
+
[2026-01-08 20:05:07 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 232 |
+
[2026-01-08 20:05:08 root] (train_utils.py 108): INFO ========= Layer 10 =========
|
| 233 |
+
[2026-01-08 20:05:16 root] (train_utils.py 185): INFO layer 10 lwc lac iter 0, lr 0.00494542 time 5.329155s, mse: 0.44592521
|
| 234 |
+
[2026-01-08 20:05:20 root] (train_utils.py 185): INFO layer 10 lwc lac iter 1, lr 0.00478408 time 3.887850s, mse: 0.28058022
|
| 235 |
+
[2026-01-08 20:05:24 root] (train_utils.py 185): INFO layer 10 lwc lac iter 2, lr 0.00452302 time 3.882091s, mse: 0.22870731
|
| 236 |
+
[2026-01-08 20:05:28 root] (train_utils.py 185): INFO layer 10 lwc lac iter 3, lr 0.00417365 time 3.881134s, mse: 0.21672769
|
| 237 |
+
[2026-01-08 20:05:32 root] (train_utils.py 185): INFO layer 10 lwc lac iter 4, lr 0.00375125 time 3.886322s, mse: 0.21354958
|
| 238 |
+
[2026-01-08 20:05:36 root] (train_utils.py 185): INFO layer 10 lwc lac iter 5, lr 0.00327427 time 3.892071s, mse: 0.21149486
|
| 239 |
+
[2026-01-08 20:05:40 root] (train_utils.py 185): INFO layer 10 lwc lac iter 6, lr 0.00276356 time 3.897464s, mse: 0.21045262
|
| 240 |
+
[2026-01-08 20:05:44 root] (train_utils.py 185): INFO layer 10 lwc lac iter 7, lr 0.00224144 time 3.917081s, mse: 0.20926467
|
| 241 |
+
[2026-01-08 20:05:47 root] (train_utils.py 185): INFO layer 10 lwc lac iter 8, lr 0.00173073 time 3.884677s, mse: 0.20823501
|
| 242 |
+
[2026-01-08 20:05:51 root] (train_utils.py 185): INFO layer 10 lwc lac iter 9, lr 0.00125375 time 3.881175s, mse: 0.20746952
|
| 243 |
+
[2026-01-08 20:05:55 root] (train_utils.py 185): INFO layer 10 lwc lac iter 10, lr 0.00083135 time 3.880876s, mse: 0.20690618
|
| 244 |
+
[2026-01-08 20:05:59 root] (train_utils.py 185): INFO layer 10 lwc lac iter 11, lr 0.00048198 time 3.879183s, mse: 0.20613439
|
| 245 |
+
[2026-01-08 20:06:03 root] (train_utils.py 185): INFO layer 10 lwc lac iter 12, lr 0.00022092 time 3.877125s, mse: 0.20562243
|
| 246 |
+
[2026-01-08 20:06:07 root] (train_utils.py 185): INFO layer 10 lwc lac iter 13, lr 0.00005958 time 3.875461s, mse: 0.20517452
|
| 247 |
+
[2026-01-08 20:06:11 root] (train_utils.py 185): INFO layer 10 lwc lac iter 14, lr 0.00000500 time 3.886986s, mse: 0.20504668
|
| 248 |
+
[2026-01-08 20:06:11 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 249 |
+
[2026-01-08 20:06:12 root] (train_utils.py 108): INFO ========= Layer 11 =========
|
| 250 |
+
[2026-01-08 20:06:20 root] (train_utils.py 185): INFO layer 11 lwc lac iter 0, lr 0.00494542 time 5.241796s, mse: 0.39262417
|
| 251 |
+
[2026-01-08 20:06:24 root] (train_utils.py 185): INFO layer 11 lwc lac iter 1, lr 0.00478408 time 3.955906s, mse: 0.27127978
|
| 252 |
+
[2026-01-08 20:06:28 root] (train_utils.py 185): INFO layer 11 lwc lac iter 2, lr 0.00452302 time 3.879225s, mse: 0.22630122
|
| 253 |
+
[2026-01-08 20:06:31 root] (train_utils.py 185): INFO layer 11 lwc lac iter 3, lr 0.00417365 time 3.883091s, mse: 0.21789221
|
| 254 |
+
[2026-01-08 20:06:35 root] (train_utils.py 185): INFO layer 11 lwc lac iter 4, lr 0.00375125 time 3.883498s, mse: 0.21573043
|
| 255 |
+
[2026-01-08 20:06:39 root] (train_utils.py 185): INFO layer 11 lwc lac iter 5, lr 0.00327427 time 3.888780s, mse: 0.21401882
|
| 256 |
+
[2026-01-08 20:06:43 root] (train_utils.py 185): INFO layer 11 lwc lac iter 6, lr 0.00276356 time 3.883860s, mse: 0.21313243
|
| 257 |
+
[2026-01-08 20:06:47 root] (train_utils.py 185): INFO layer 11 lwc lac iter 7, lr 0.00224144 time 3.908260s, mse: 0.21215978
|
| 258 |
+
[2026-01-08 20:06:51 root] (train_utils.py 185): INFO layer 11 lwc lac iter 8, lr 0.00173073 time 3.875031s, mse: 0.21121168
|
| 259 |
+
[2026-01-08 20:06:55 root] (train_utils.py 185): INFO layer 11 lwc lac iter 9, lr 0.00125375 time 3.878481s, mse: 0.21032479
|
| 260 |
+
[2026-01-08 20:06:59 root] (train_utils.py 185): INFO layer 11 lwc lac iter 10, lr 0.00083135 time 3.885566s, mse: 0.20987187
|
| 261 |
+
[2026-01-08 20:07:03 root] (train_utils.py 185): INFO layer 11 lwc lac iter 11, lr 0.00048198 time 3.890397s, mse: 0.20908046
|
| 262 |
+
[2026-01-08 20:07:06 root] (train_utils.py 185): INFO layer 11 lwc lac iter 12, lr 0.00022092 time 3.880580s, mse: 0.20848191
|
| 263 |
+
[2026-01-08 20:07:10 root] (train_utils.py 185): INFO layer 11 lwc lac iter 13, lr 0.00005958 time 3.883235s, mse: 0.20800886
|
| 264 |
+
[2026-01-08 20:07:14 root] (train_utils.py 185): INFO layer 11 lwc lac iter 14, lr 0.00000500 time 3.877982s, mse: 0.20795538
|
| 265 |
+
[2026-01-08 20:07:15 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 266 |
+
[2026-01-08 20:07:15 root] (train_utils.py 108): INFO ========= Layer 12 =========
|
| 267 |
+
[2026-01-08 20:07:23 root] (train_utils.py 185): INFO layer 12 lwc lac iter 0, lr 0.00494542 time 5.431174s, mse: 0.43535280
|
| 268 |
+
[2026-01-08 20:07:27 root] (train_utils.py 185): INFO layer 12 lwc lac iter 1, lr 0.00478408 time 3.955285s, mse: 0.29579335
|
| 269 |
+
[2026-01-08 20:07:31 root] (train_utils.py 185): INFO layer 12 lwc lac iter 2, lr 0.00452302 time 3.883549s, mse: 0.24488190
|
| 270 |
+
[2026-01-08 20:07:35 root] (train_utils.py 185): INFO layer 12 lwc lac iter 3, lr 0.00417365 time 3.886171s, mse: 0.23438135
|
| 271 |
+
[2026-01-08 20:07:39 root] (train_utils.py 185): INFO layer 12 lwc lac iter 4, lr 0.00375125 time 3.883848s, mse: 0.23133603
|
| 272 |
+
[2026-01-08 20:07:42 root] (train_utils.py 185): INFO layer 12 lwc lac iter 5, lr 0.00327427 time 3.884988s, mse: 0.22933656
|
| 273 |
+
[2026-01-08 20:07:46 root] (train_utils.py 185): INFO layer 12 lwc lac iter 6, lr 0.00276356 time 3.878419s, mse: 0.22804067
|
| 274 |
+
[2026-01-08 20:07:50 root] (train_utils.py 185): INFO layer 12 lwc lac iter 7, lr 0.00224144 time 3.877913s, mse: 0.22690852
|
| 275 |
+
[2026-01-08 20:07:54 root] (train_utils.py 185): INFO layer 12 lwc lac iter 8, lr 0.00173073 time 3.882433s, mse: 0.22579126
|
| 276 |
+
[2026-01-08 20:07:58 root] (train_utils.py 185): INFO layer 12 lwc lac iter 9, lr 0.00125375 time 3.878513s, mse: 0.22475064
|
| 277 |
+
[2026-01-08 20:08:02 root] (train_utils.py 185): INFO layer 12 lwc lac iter 10, lr 0.00083135 time 3.881245s, mse: 0.22366890
|
| 278 |
+
[2026-01-08 20:08:06 root] (train_utils.py 185): INFO layer 12 lwc lac iter 11, lr 0.00048198 time 3.883585s, mse: 0.22277188
|
| 279 |
+
[2026-01-08 20:08:10 root] (train_utils.py 185): INFO layer 12 lwc lac iter 12, lr 0.00022092 time 3.883076s, mse: 0.22196589
|
| 280 |
+
[2026-01-08 20:08:13 root] (train_utils.py 185): INFO layer 12 lwc lac iter 13, lr 0.00005958 time 3.877839s, mse: 0.22144113
|
| 281 |
+
[2026-01-08 20:08:17 root] (train_utils.py 185): INFO layer 12 lwc lac iter 14, lr 0.00000500 time 3.886441s, mse: 0.22116731
|
| 282 |
+
[2026-01-08 20:08:18 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 283 |
+
[2026-01-08 20:08:19 root] (train_utils.py 108): INFO ========= Layer 13 =========
|
| 284 |
+
[2026-01-08 20:08:27 root] (train_utils.py 185): INFO layer 13 lwc lac iter 0, lr 0.00494542 time 5.372051s, mse: 0.44991863
|
| 285 |
+
[2026-01-08 20:08:30 root] (train_utils.py 185): INFO layer 13 lwc lac iter 1, lr 0.00478408 time 3.877190s, mse: 0.30773303
|
| 286 |
+
[2026-01-08 20:08:34 root] (train_utils.py 185): INFO layer 13 lwc lac iter 2, lr 0.00452302 time 3.877676s, mse: 0.25602528
|
| 287 |
+
[2026-01-08 20:08:38 root] (train_utils.py 185): INFO layer 13 lwc lac iter 3, lr 0.00417365 time 3.887986s, mse: 0.24593170
|
| 288 |
+
[2026-01-08 20:08:42 root] (train_utils.py 185): INFO layer 13 lwc lac iter 4, lr 0.00375125 time 3.881023s, mse: 0.24332635
|
| 289 |
+
[2026-01-08 20:08:46 root] (train_utils.py 185): INFO layer 13 lwc lac iter 5, lr 0.00327427 time 3.883145s, mse: 0.24169515
|
| 290 |
+
[2026-01-08 20:08:50 root] (train_utils.py 185): INFO layer 13 lwc lac iter 6, lr 0.00276356 time 3.875900s, mse: 0.24032030
|
| 291 |
+
[2026-01-08 20:08:54 root] (train_utils.py 185): INFO layer 13 lwc lac iter 7, lr 0.00224144 time 3.880186s, mse: 0.23895445
|
| 292 |
+
[2026-01-08 20:08:58 root] (train_utils.py 185): INFO layer 13 lwc lac iter 8, lr 0.00173073 time 3.879497s, mse: 0.23795472
|
| 293 |
+
[2026-01-08 20:09:01 root] (train_utils.py 185): INFO layer 13 lwc lac iter 9, lr 0.00125375 time 3.878269s, mse: 0.23691620
|
| 294 |
+
[2026-01-08 20:09:05 root] (train_utils.py 185): INFO layer 13 lwc lac iter 10, lr 0.00083135 time 3.893473s, mse: 0.23617835
|
| 295 |
+
[2026-01-08 20:09:09 root] (train_utils.py 185): INFO layer 13 lwc lac iter 11, lr 0.00048198 time 3.880821s, mse: 0.23538260
|
| 296 |
+
[2026-01-08 20:09:13 root] (train_utils.py 185): INFO layer 13 lwc lac iter 12, lr 0.00022092 time 3.883802s, mse: 0.23459788
|
| 297 |
+
[2026-01-08 20:09:17 root] (train_utils.py 185): INFO layer 13 lwc lac iter 13, lr 0.00005958 time 3.885744s, mse: 0.23386008
|
| 298 |
+
[2026-01-08 20:09:21 root] (train_utils.py 185): INFO layer 13 lwc lac iter 14, lr 0.00000500 time 3.877215s, mse: 0.23347831
|
| 299 |
+
[2026-01-08 20:09:21 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 300 |
+
[2026-01-08 20:09:22 root] (train_utils.py 108): INFO ========= Layer 14 =========
|
| 301 |
+
[2026-01-08 20:09:30 root] (train_utils.py 185): INFO layer 14 lwc lac iter 0, lr 0.00494542 time 4.990215s, mse: 0.48670265
|
| 302 |
+
[2026-01-08 20:09:33 root] (train_utils.py 185): INFO layer 14 lwc lac iter 1, lr 0.00478408 time 3.876076s, mse: 0.32924685
|
| 303 |
+
[2026-01-08 20:09:37 root] (train_utils.py 185): INFO layer 14 lwc lac iter 2, lr 0.00452302 time 3.887783s, mse: 0.27174610
|
| 304 |
+
[2026-01-08 20:09:41 root] (train_utils.py 185): INFO layer 14 lwc lac iter 3, lr 0.00417365 time 3.876861s, mse: 0.26111004
|
| 305 |
+
[2026-01-08 20:09:45 root] (train_utils.py 185): INFO layer 14 lwc lac iter 4, lr 0.00375125 time 3.882962s, mse: 0.25857583
|
| 306 |
+
[2026-01-08 20:09:49 root] (train_utils.py 185): INFO layer 14 lwc lac iter 5, lr 0.00327427 time 3.906216s, mse: 0.25724220
|
| 307 |
+
[2026-01-08 20:09:53 root] (train_utils.py 185): INFO layer 14 lwc lac iter 6, lr 0.00276356 time 3.876176s, mse: 0.25530052
|
| 308 |
+
[2026-01-08 20:09:57 root] (train_utils.py 185): INFO layer 14 lwc lac iter 7, lr 0.00224144 time 3.885303s, mse: 0.25373703
|
| 309 |
+
[2026-01-08 20:10:01 root] (train_utils.py 185): INFO layer 14 lwc lac iter 8, lr 0.00173073 time 3.876806s, mse: 0.25232333
|
| 310 |
+
[2026-01-08 20:10:04 root] (train_utils.py 185): INFO layer 14 lwc lac iter 9, lr 0.00125375 time 3.882736s, mse: 0.25103748
|
| 311 |
+
[2026-01-08 20:10:08 root] (train_utils.py 185): INFO layer 14 lwc lac iter 10, lr 0.00083135 time 3.881382s, mse: 0.24987648
|
| 312 |
+
[2026-01-08 20:10:12 root] (train_utils.py 185): INFO layer 14 lwc lac iter 11, lr 0.00048198 time 3.883449s, mse: 0.24912813
|
| 313 |
+
[2026-01-08 20:10:16 root] (train_utils.py 185): INFO layer 14 lwc lac iter 12, lr 0.00022092 time 3.880871s, mse: 0.24813016
|
| 314 |
+
[2026-01-08 20:10:20 root] (train_utils.py 185): INFO layer 14 lwc lac iter 13, lr 0.00005958 time 3.890360s, mse: 0.24762598
|
| 315 |
+
[2026-01-08 20:10:24 root] (train_utils.py 185): INFO layer 14 lwc lac iter 14, lr 0.00000500 time 3.888802s, mse: 0.24739194
|
| 316 |
+
[2026-01-08 20:10:24 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 317 |
+
[2026-01-08 20:10:25 root] (train_utils.py 108): INFO ========= Layer 15 =========
|
| 318 |
+
[2026-01-08 20:10:33 root] (train_utils.py 185): INFO layer 15 lwc lac iter 0, lr 0.00494542 time 4.986362s, mse: 0.48941827
|
| 319 |
+
[2026-01-08 20:10:37 root] (train_utils.py 185): INFO layer 15 lwc lac iter 1, lr 0.00478408 time 3.880157s, mse: 0.32720220
|
| 320 |
+
[2026-01-08 20:10:41 root] (train_utils.py 185): INFO layer 15 lwc lac iter 2, lr 0.00452302 time 3.880608s, mse: 0.26854873
|
| 321 |
+
[2026-01-08 20:10:44 root] (train_utils.py 185): INFO layer 15 lwc lac iter 3, lr 0.00417365 time 3.880978s, mse: 0.25705975
|
| 322 |
+
[2026-01-08 20:10:48 root] (train_utils.py 185): INFO layer 15 lwc lac iter 4, lr 0.00375125 time 3.878423s, mse: 0.25422159
|
| 323 |
+
[2026-01-08 20:10:52 root] (train_utils.py 185): INFO layer 15 lwc lac iter 5, lr 0.00327427 time 3.891015s, mse: 0.25197345
|
| 324 |
+
[2026-01-08 20:10:56 root] (train_utils.py 185): INFO layer 15 lwc lac iter 6, lr 0.00276356 time 3.883527s, mse: 0.25026903
|
| 325 |
+
[2026-01-08 20:11:00 root] (train_utils.py 185): INFO layer 15 lwc lac iter 7, lr 0.00224144 time 3.874571s, mse: 0.24867499
|
| 326 |
+
[2026-01-08 20:11:04 root] (train_utils.py 185): INFO layer 15 lwc lac iter 8, lr 0.00173073 time 3.883435s, mse: 0.24771519
|
| 327 |
+
[2026-01-08 20:11:08 root] (train_utils.py 185): INFO layer 15 lwc lac iter 9, lr 0.00125375 time 3.882031s, mse: 0.24665023
|
| 328 |
+
[2026-01-08 20:11:12 root] (train_utils.py 185): INFO layer 15 lwc lac iter 10, lr 0.00083135 time 3.881983s, mse: 0.24558856
|
| 329 |
+
[2026-01-08 20:11:15 root] (train_utils.py 185): INFO layer 15 lwc lac iter 11, lr 0.00048198 time 3.877608s, mse: 0.24435455
|
| 330 |
+
[2026-01-08 20:11:19 root] (train_utils.py 185): INFO layer 15 lwc lac iter 12, lr 0.00022092 time 3.889889s, mse: 0.24346027
|
| 331 |
+
[2026-01-08 20:11:23 root] (train_utils.py 185): INFO layer 15 lwc lac iter 13, lr 0.00005958 time 3.883180s, mse: 0.24292424
|
| 332 |
+
[2026-01-08 20:11:27 root] (train_utils.py 185): INFO layer 15 lwc lac iter 14, lr 0.00000500 time 3.884431s, mse: 0.24260354
|
| 333 |
+
[2026-01-08 20:11:28 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 334 |
+
[2026-01-08 20:11:28 root] (train_utils.py 108): INFO ========= Layer 16 =========
|
| 335 |
+
[2026-01-08 20:11:36 root] (train_utils.py 185): INFO layer 16 lwc lac iter 0, lr 0.00494542 time 4.840796s, mse: 3.09758520
|
| 336 |
+
[2026-01-08 20:11:39 root] (train_utils.py 185): INFO layer 16 lwc lac iter 1, lr 0.00478408 time 3.900541s, mse: 1.53681600
|
| 337 |
+
[2026-01-08 20:11:43 root] (train_utils.py 185): INFO layer 16 lwc lac iter 2, lr 0.00452302 time 3.880463s, mse: 1.37538433
|
| 338 |
+
[2026-01-08 20:11:47 root] (train_utils.py 185): INFO layer 16 lwc lac iter 3, lr 0.00417365 time 3.883470s, mse: 1.14041376
|
| 339 |
+
[2026-01-08 20:11:51 root] (train_utils.py 185): INFO layer 16 lwc lac iter 4, lr 0.00375125 time 3.874584s, mse: 1.13041377
|
| 340 |
+
[2026-01-08 20:11:55 root] (train_utils.py 185): INFO layer 16 lwc lac iter 5, lr 0.00327427 time 3.879631s, mse: 1.17505825
|
| 341 |
+
[2026-01-08 20:11:59 root] (train_utils.py 185): INFO layer 16 lwc lac iter 6, lr 0.00276356 time 3.878177s, mse: 1.00187659
|
| 342 |
+
[2026-01-08 20:12:03 root] (train_utils.py 185): INFO layer 16 lwc lac iter 7, lr 0.00224144 time 3.875633s, mse: 1.15916288
|
| 343 |
+
[2026-01-08 20:12:07 root] (train_utils.py 185): INFO layer 16 lwc lac iter 8, lr 0.00173073 time 3.881597s, mse: 0.93556213
|
| 344 |
+
[2026-01-08 20:12:11 root] (train_utils.py 185): INFO layer 16 lwc lac iter 9, lr 0.00125375 time 3.873534s, mse: 0.89307052
|
| 345 |
+
[2026-01-08 20:12:14 root] (train_utils.py 185): INFO layer 16 lwc lac iter 10, lr 0.00083135 time 3.875691s, mse: 1.08854449
|
| 346 |
+
[2026-01-08 20:12:18 root] (train_utils.py 185): INFO layer 16 lwc lac iter 11, lr 0.00048198 time 3.883201s, mse: 0.78587675
|
| 347 |
+
[2026-01-08 20:12:22 root] (train_utils.py 185): INFO layer 16 lwc lac iter 12, lr 0.00022092 time 3.879601s, mse: 0.77024889
|
| 348 |
+
[2026-01-08 20:12:26 root] (train_utils.py 185): INFO layer 16 lwc lac iter 13, lr 0.00005958 time 3.882570s, mse: 0.74143833
|
| 349 |
+
[2026-01-08 20:12:30 root] (train_utils.py 185): INFO layer 16 lwc lac iter 14, lr 0.00000500 time 3.885281s, mse: 0.62904388
|
| 350 |
+
[2026-01-08 20:12:30 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 351 |
+
[2026-01-08 20:12:31 root] (train_utils.py 108): INFO ========= Layer 17 =========
|
| 352 |
+
[2026-01-08 20:12:39 root] (train_utils.py 185): INFO layer 17 lwc lac iter 0, lr 0.00494542 time 5.544866s, mse: 0.57632238
|
| 353 |
+
[2026-01-08 20:12:43 root] (train_utils.py 185): INFO layer 17 lwc lac iter 1, lr 0.00478408 time 3.915652s, mse: 0.38568184
|
| 354 |
+
[2026-01-08 20:12:47 root] (train_utils.py 185): INFO layer 17 lwc lac iter 2, lr 0.00452302 time 3.885339s, mse: 0.30990756
|
| 355 |
+
[2026-01-08 20:12:51 root] (train_utils.py 185): INFO layer 17 lwc lac iter 3, lr 0.00417365 time 3.893072s, mse: 0.29348093
|
| 356 |
+
[2026-01-08 20:12:55 root] (train_utils.py 185): INFO layer 17 lwc lac iter 4, lr 0.00375125 time 3.887810s, mse: 0.28841209
|
| 357 |
+
[2026-01-08 20:12:59 root] (train_utils.py 185): INFO layer 17 lwc lac iter 5, lr 0.00327427 time 3.886663s, mse: 0.28536177
|
| 358 |
+
[2026-01-08 20:13:02 root] (train_utils.py 185): INFO layer 17 lwc lac iter 6, lr 0.00276356 time 3.891812s, mse: 0.28336507
|
| 359 |
+
[2026-01-08 20:13:06 root] (train_utils.py 185): INFO layer 17 lwc lac iter 7, lr 0.00224144 time 3.885844s, mse: 0.28023016
|
| 360 |
+
[2026-01-08 20:13:10 root] (train_utils.py 185): INFO layer 17 lwc lac iter 8, lr 0.00173073 time 3.883247s, mse: 0.27797151
|
| 361 |
+
[2026-01-08 20:13:14 root] (train_utils.py 185): INFO layer 17 lwc lac iter 9, lr 0.00125375 time 3.882213s, mse: 0.27724716
|
| 362 |
+
[2026-01-08 20:13:18 root] (train_utils.py 185): INFO layer 17 lwc lac iter 10, lr 0.00083135 time 3.885559s, mse: 0.27549568
|
| 363 |
+
[2026-01-08 20:13:22 root] (train_utils.py 185): INFO layer 17 lwc lac iter 11, lr 0.00048198 time 3.885018s, mse: 0.27411795
|
| 364 |
+
[2026-01-08 20:13:26 root] (train_utils.py 185): INFO layer 17 lwc lac iter 12, lr 0.00022092 time 3.889595s, mse: 0.27230272
|
| 365 |
+
[2026-01-08 20:13:30 root] (train_utils.py 185): INFO layer 17 lwc lac iter 13, lr 0.00005958 time 3.888286s, mse: 0.27161792
|
| 366 |
+
[2026-01-08 20:13:34 root] (train_utils.py 185): INFO layer 17 lwc lac iter 14, lr 0.00000500 time 3.888104s, mse: 0.27142629
|
| 367 |
+
[2026-01-08 20:13:34 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 368 |
+
[2026-01-08 20:13:34 root] (train_utils.py 108): INFO ========= Layer 18 =========
|
| 369 |
+
[2026-01-08 20:13:42 root] (train_utils.py 185): INFO layer 18 lwc lac iter 0, lr 0.00494542 time 4.824618s, mse: 0.68219566
|
| 370 |
+
[2026-01-08 20:13:46 root] (train_utils.py 185): INFO layer 18 lwc lac iter 1, lr 0.00478408 time 3.956999s, mse: 0.44933167
|
| 371 |
+
[2026-01-08 20:13:50 root] (train_utils.py 185): INFO layer 18 lwc lac iter 2, lr 0.00452302 time 4.207351s, mse: 0.36149144
|
| 372 |
+
[2026-01-08 20:13:54 root] (train_utils.py 185): INFO layer 18 lwc lac iter 3, lr 0.00417365 time 4.268250s, mse: 0.34437451
|
| 373 |
+
[2026-01-08 20:13:59 root] (train_utils.py 185): INFO layer 18 lwc lac iter 4, lr 0.00375125 time 4.559022s, mse: 0.33928376
|
| 374 |
+
[2026-01-08 20:14:02 root] (train_utils.py 185): INFO layer 18 lwc lac iter 5, lr 0.00327427 time 3.898552s, mse: 0.33628541
|
| 375 |
+
[2026-01-08 20:14:06 root] (train_utils.py 185): INFO layer 18 lwc lac iter 6, lr 0.00276356 time 3.896710s, mse: 0.33380261
|
| 376 |
+
[2026-01-08 20:14:10 root] (train_utils.py 185): INFO layer 18 lwc lac iter 7, lr 0.00224144 time 3.873617s, mse: 0.33132178
|
| 377 |
+
[2026-01-08 20:14:14 root] (train_utils.py 185): INFO layer 18 lwc lac iter 8, lr 0.00173073 time 3.874515s, mse: 0.32943395
|
| 378 |
+
[2026-01-08 20:14:18 root] (train_utils.py 185): INFO layer 18 lwc lac iter 9, lr 0.00125375 time 3.881486s, mse: 0.32786560
|
| 379 |
+
[2026-01-08 20:14:22 root] (train_utils.py 185): INFO layer 18 lwc lac iter 10, lr 0.00083135 time 3.874159s, mse: 0.32583937
|
| 380 |
+
[2026-01-08 20:14:26 root] (train_utils.py 185): INFO layer 18 lwc lac iter 11, lr 0.00048198 time 3.872985s, mse: 0.32450172
|
| 381 |
+
[2026-01-08 20:14:30 root] (train_utils.py 185): INFO layer 18 lwc lac iter 12, lr 0.00022092 time 3.881658s, mse: 0.32264820
|
| 382 |
+
[2026-01-08 20:14:34 root] (train_utils.py 185): INFO layer 18 lwc lac iter 13, lr 0.00005958 time 3.877914s, mse: 0.32187557
|
| 383 |
+
[2026-01-08 20:14:37 root] (train_utils.py 185): INFO layer 18 lwc lac iter 14, lr 0.00000500 time 3.878186s, mse: 0.32105669
|
| 384 |
+
[2026-01-08 20:14:38 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 385 |
+
[2026-01-08 20:14:38 root] (train_utils.py 108): INFO ========= Layer 19 =========
|
| 386 |
+
[2026-01-08 20:14:45 root] (train_utils.py 185): INFO layer 19 lwc lac iter 0, lr 0.00494542 time 4.583254s, mse: 0.88728219
|
| 387 |
+
[2026-01-08 20:14:49 root] (train_utils.py 185): INFO layer 19 lwc lac iter 1, lr 0.00478408 time 3.886281s, mse: 0.57078516
|
| 388 |
+
[2026-01-08 20:14:53 root] (train_utils.py 185): INFO layer 19 lwc lac iter 2, lr 0.00452302 time 3.872338s, mse: 0.45792666
|
| 389 |
+
[2026-01-08 20:14:57 root] (train_utils.py 185): INFO layer 19 lwc lac iter 3, lr 0.00417365 time 3.876560s, mse: 0.43537480
|
| 390 |
+
[2026-01-08 20:15:01 root] (train_utils.py 185): INFO layer 19 lwc lac iter 4, lr 0.00375125 time 3.874999s, mse: 0.42894897
|
| 391 |
+
[2026-01-08 20:15:05 root] (train_utils.py 185): INFO layer 19 lwc lac iter 5, lr 0.00327427 time 3.882475s, mse: 0.42462113
|
| 392 |
+
[2026-01-08 20:15:09 root] (train_utils.py 185): INFO layer 19 lwc lac iter 6, lr 0.00276356 time 3.877281s, mse: 0.42157629
|
| 393 |
+
[2026-01-08 20:15:13 root] (train_utils.py 185): INFO layer 19 lwc lac iter 7, lr 0.00224144 time 3.878093s, mse: 0.41864219
|
| 394 |
+
[2026-01-08 20:15:16 root] (train_utils.py 185): INFO layer 19 lwc lac iter 8, lr 0.00173073 time 3.882066s, mse: 0.41570342
|
| 395 |
+
[2026-01-08 20:15:20 root] (train_utils.py 185): INFO layer 19 lwc lac iter 9, lr 0.00125375 time 3.872604s, mse: 0.41345572
|
| 396 |
+
[2026-01-08 20:15:24 root] (train_utils.py 185): INFO layer 19 lwc lac iter 10, lr 0.00083135 time 3.875411s, mse: 0.41054672
|
| 397 |
+
[2026-01-08 20:15:28 root] (train_utils.py 185): INFO layer 19 lwc lac iter 11, lr 0.00048198 time 3.879127s, mse: 0.40846488
|
| 398 |
+
[2026-01-08 20:15:32 root] (train_utils.py 185): INFO layer 19 lwc lac iter 12, lr 0.00022092 time 3.876599s, mse: 0.40727249
|
| 399 |
+
[2026-01-08 20:15:36 root] (train_utils.py 185): INFO layer 19 lwc lac iter 13, lr 0.00005958 time 3.881322s, mse: 0.40628025
|
| 400 |
+
[2026-01-08 20:15:40 root] (train_utils.py 185): INFO layer 19 lwc lac iter 14, lr 0.00000500 time 3.875526s, mse: 0.40573606
|
| 401 |
+
[2026-01-08 20:15:40 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 402 |
+
[2026-01-08 20:15:41 root] (train_utils.py 108): INFO ========= Layer 20 =========
|
| 403 |
+
[2026-01-08 20:15:49 root] (train_utils.py 185): INFO layer 20 lwc lac iter 0, lr 0.00494542 time 4.995933s, mse: 0.88836050
|
| 404 |
+
[2026-01-08 20:15:53 root] (train_utils.py 185): INFO layer 20 lwc lac iter 1, lr 0.00478408 time 3.873272s, mse: 0.59483135
|
| 405 |
+
[2026-01-08 20:15:56 root] (train_utils.py 185): INFO layer 20 lwc lac iter 2, lr 0.00452302 time 3.875340s, mse: 0.48579982
|
| 406 |
+
[2026-01-08 20:16:00 root] (train_utils.py 185): INFO layer 20 lwc lac iter 3, lr 0.00417365 time 3.876288s, mse: 0.46583182
|
| 407 |
+
[2026-01-08 20:16:04 root] (train_utils.py 185): INFO layer 20 lwc lac iter 4, lr 0.00375125 time 3.881600s, mse: 0.46044937
|
| 408 |
+
[2026-01-08 20:16:08 root] (train_utils.py 185): INFO layer 20 lwc lac iter 5, lr 0.00327427 time 3.869276s, mse: 0.45749170
|
| 409 |
+
[2026-01-08 20:16:12 root] (train_utils.py 185): INFO layer 20 lwc lac iter 6, lr 0.00276356 time 3.882871s, mse: 0.45316568
|
| 410 |
+
[2026-01-08 20:16:16 root] (train_utils.py 185): INFO layer 20 lwc lac iter 7, lr 0.00224144 time 3.873422s, mse: 0.45053339
|
| 411 |
+
[2026-01-08 20:16:20 root] (train_utils.py 185): INFO layer 20 lwc lac iter 8, lr 0.00173073 time 3.871368s, mse: 0.44832462
|
| 412 |
+
[2026-01-08 20:16:24 root] (train_utils.py 185): INFO layer 20 lwc lac iter 9, lr 0.00125375 time 3.892880s, mse: 0.44616416
|
| 413 |
+
[2026-01-08 20:16:27 root] (train_utils.py 185): INFO layer 20 lwc lac iter 10, lr 0.00083135 time 3.879390s, mse: 0.44334349
|
| 414 |
+
[2026-01-08 20:16:31 root] (train_utils.py 185): INFO layer 20 lwc lac iter 11, lr 0.00048198 time 3.874708s, mse: 0.44204527
|
| 415 |
+
[2026-01-08 20:16:35 root] (train_utils.py 185): INFO layer 20 lwc lac iter 12, lr 0.00022092 time 3.879157s, mse: 0.43987796
|
| 416 |
+
[2026-01-08 20:16:39 root] (train_utils.py 185): INFO layer 20 lwc lac iter 13, lr 0.00005958 time 3.923136s, mse: 0.43863490
|
| 417 |
+
[2026-01-08 20:16:43 root] (train_utils.py 185): INFO layer 20 lwc lac iter 14, lr 0.00000500 time 3.887576s, mse: 0.43791217
|
| 418 |
+
[2026-01-08 20:16:43 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 419 |
+
[2026-01-08 20:16:44 root] (train_utils.py 108): INFO ========= Layer 21 =========
|
| 420 |
+
[2026-01-08 20:16:52 root] (train_utils.py 185): INFO layer 21 lwc lac iter 0, lr 0.00494542 time 5.168227s, mse: 1.18043423
|
| 421 |
+
[2026-01-08 20:16:56 root] (train_utils.py 185): INFO layer 21 lwc lac iter 1, lr 0.00478408 time 3.969011s, mse: 0.77954561
|
| 422 |
+
[2026-01-08 20:17:00 root] (train_utils.py 185): INFO layer 21 lwc lac iter 2, lr 0.00452302 time 3.886918s, mse: 0.64111829
|
| 423 |
+
[2026-01-08 20:17:04 root] (train_utils.py 185): INFO layer 21 lwc lac iter 3, lr 0.00417365 time 3.872880s, mse: 0.61397409
|
| 424 |
+
[2026-01-08 20:17:08 root] (train_utils.py 185): INFO layer 21 lwc lac iter 4, lr 0.00375125 time 3.885565s, mse: 0.60631013
|
| 425 |
+
[2026-01-08 20:17:11 root] (train_utils.py 185): INFO layer 21 lwc lac iter 5, lr 0.00327427 time 3.888389s, mse: 0.60047567
|
| 426 |
+
[2026-01-08 20:17:15 root] (train_utils.py 185): INFO layer 21 lwc lac iter 6, lr 0.00276356 time 3.884962s, mse: 0.59512597
|
| 427 |
+
[2026-01-08 20:17:19 root] (train_utils.py 185): INFO layer 21 lwc lac iter 7, lr 0.00224144 time 3.891089s, mse: 0.59215677
|
| 428 |
+
[2026-01-08 20:17:23 root] (train_utils.py 185): INFO layer 21 lwc lac iter 8, lr 0.00173073 time 3.884314s, mse: 0.58796024
|
| 429 |
+
[2026-01-08 20:17:27 root] (train_utils.py 185): INFO layer 21 lwc lac iter 9, lr 0.00125375 time 3.872022s, mse: 0.58513182
|
| 430 |
+
[2026-01-08 20:17:31 root] (train_utils.py 185): INFO layer 21 lwc lac iter 10, lr 0.00083135 time 3.889990s, mse: 0.58225924
|
| 431 |
+
[2026-01-08 20:17:35 root] (train_utils.py 185): INFO layer 21 lwc lac iter 11, lr 0.00048198 time 3.887443s, mse: 0.57988369
|
| 432 |
+
[2026-01-08 20:17:39 root] (train_utils.py 185): INFO layer 21 lwc lac iter 12, lr 0.00022092 time 3.889952s, mse: 0.57718277
|
| 433 |
+
[2026-01-08 20:17:43 root] (train_utils.py 185): INFO layer 21 lwc lac iter 13, lr 0.00005958 time 3.880646s, mse: 0.57546204
|
| 434 |
+
[2026-01-08 20:17:46 root] (train_utils.py 185): INFO layer 21 lwc lac iter 14, lr 0.00000500 time 3.886966s, mse: 0.57469940
|
| 435 |
+
[2026-01-08 20:17:47 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 436 |
+
[2026-01-08 20:17:48 root] (train_utils.py 108): INFO ========= Layer 22 =========
|
| 437 |
+
[2026-01-08 20:17:55 root] (train_utils.py 185): INFO layer 22 lwc lac iter 0, lr 0.00494542 time 5.094079s, mse: 1.88664389
|
| 438 |
+
[2026-01-08 20:17:59 root] (train_utils.py 185): INFO layer 22 lwc lac iter 1, lr 0.00478408 time 3.885375s, mse: 1.18959606
|
| 439 |
+
[2026-01-08 20:18:03 root] (train_utils.py 185): INFO layer 22 lwc lac iter 2, lr 0.00452302 time 3.886684s, mse: 0.95907360
|
| 440 |
+
[2026-01-08 20:18:07 root] (train_utils.py 185): INFO layer 22 lwc lac iter 3, lr 0.00417365 time 3.882739s, mse: 0.91428280
|
| 441 |
+
[2026-01-08 20:18:11 root] (train_utils.py 185): INFO layer 22 lwc lac iter 4, lr 0.00375125 time 3.883721s, mse: 0.90376323
|
| 442 |
+
[2026-01-08 20:18:15 root] (train_utils.py 185): INFO layer 22 lwc lac iter 5, lr 0.00327427 time 3.892156s, mse: 0.89363086
|
| 443 |
+
[2026-01-08 20:18:19 root] (train_utils.py 185): INFO layer 22 lwc lac iter 6, lr 0.00276356 time 3.886229s, mse: 0.88751125
|
| 444 |
+
[2026-01-08 20:18:23 root] (train_utils.py 185): INFO layer 22 lwc lac iter 7, lr 0.00224144 time 3.886792s, mse: 0.87932986
|
| 445 |
+
[2026-01-08 20:18:26 root] (train_utils.py 185): INFO layer 22 lwc lac iter 8, lr 0.00173073 time 3.887209s, mse: 0.87506205
|
| 446 |
+
[2026-01-08 20:18:30 root] (train_utils.py 185): INFO layer 22 lwc lac iter 9, lr 0.00125375 time 3.894486s, mse: 0.86960399
|
| 447 |
+
[2026-01-08 20:18:34 root] (train_utils.py 185): INFO layer 22 lwc lac iter 10, lr 0.00083135 time 3.895200s, mse: 0.86433518
|
| 448 |
+
[2026-01-08 20:18:38 root] (train_utils.py 185): INFO layer 22 lwc lac iter 11, lr 0.00048198 time 3.897787s, mse: 0.85831034
|
| 449 |
+
[2026-01-08 20:18:42 root] (train_utils.py 185): INFO layer 22 lwc lac iter 12, lr 0.00022092 time 3.895077s, mse: 0.85434479
|
| 450 |
+
[2026-01-08 20:18:46 root] (train_utils.py 185): INFO layer 22 lwc lac iter 13, lr 0.00005958 time 3.885206s, mse: 0.85274106
|
| 451 |
+
[2026-01-08 20:18:50 root] (train_utils.py 185): INFO layer 22 lwc lac iter 14, lr 0.00000500 time 3.900901s, mse: 0.85105854
|
| 452 |
+
[2026-01-08 20:18:50 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 453 |
+
[2026-01-08 20:18:51 root] (train_utils.py 108): INFO ========= Layer 23 =========
|
| 454 |
+
[2026-01-08 20:18:59 root] (train_utils.py 185): INFO layer 23 lwc lac iter 0, lr 0.00494542 time 5.249717s, mse: 2.56160784
|
| 455 |
+
[2026-01-08 20:19:03 root] (train_utils.py 185): INFO layer 23 lwc lac iter 1, lr 0.00478408 time 3.885572s, mse: 1.69400561
|
| 456 |
+
[2026-01-08 20:19:07 root] (train_utils.py 185): INFO layer 23 lwc lac iter 2, lr 0.00452302 time 3.884498s, mse: 1.40092814
|
| 457 |
+
[2026-01-08 20:19:11 root] (train_utils.py 185): INFO layer 23 lwc lac iter 3, lr 0.00417365 time 3.870114s, mse: 1.33960748
|
| 458 |
+
[2026-01-08 20:19:14 root] (train_utils.py 185): INFO layer 23 lwc lac iter 4, lr 0.00375125 time 3.876960s, mse: 1.31923652
|
| 459 |
+
[2026-01-08 20:19:18 root] (train_utils.py 185): INFO layer 23 lwc lac iter 5, lr 0.00327427 time 3.886075s, mse: 1.30260742
|
| 460 |
+
[2026-01-08 20:19:22 root] (train_utils.py 185): INFO layer 23 lwc lac iter 6, lr 0.00276356 time 3.886401s, mse: 1.29341400
|
| 461 |
+
[2026-01-08 20:19:26 root] (train_utils.py 185): INFO layer 23 lwc lac iter 7, lr 0.00224144 time 3.880126s, mse: 1.28473794
|
| 462 |
+
[2026-01-08 20:19:30 root] (train_utils.py 185): INFO layer 23 lwc lac iter 8, lr 0.00173073 time 3.874241s, mse: 1.27725101
|
| 463 |
+
[2026-01-08 20:19:34 root] (train_utils.py 185): INFO layer 23 lwc lac iter 9, lr 0.00125375 time 3.873134s, mse: 1.27071691
|
| 464 |
+
[2026-01-08 20:19:38 root] (train_utils.py 185): INFO layer 23 lwc lac iter 10, lr 0.00083135 time 3.887421s, mse: 1.26552820
|
| 465 |
+
[2026-01-08 20:19:42 root] (train_utils.py 185): INFO layer 23 lwc lac iter 11, lr 0.00048198 time 3.872315s, mse: 1.26018000
|
| 466 |
+
[2026-01-08 20:19:46 root] (train_utils.py 185): INFO layer 23 lwc lac iter 12, lr 0.00022092 time 3.885125s, mse: 1.25696874
|
| 467 |
+
[2026-01-08 20:19:49 root] (train_utils.py 185): INFO layer 23 lwc lac iter 13, lr 0.00005958 time 3.875618s, mse: 1.25348544
|
| 468 |
+
[2026-01-08 20:19:53 root] (train_utils.py 185): INFO layer 23 lwc lac iter 14, lr 0.00000500 time 3.879704s, mse: 1.25113153
|
| 469 |
+
[2026-01-08 20:19:54 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 470 |
+
[2026-01-08 20:19:55 root] (train_utils.py 108): INFO ========= Layer 24 =========
|
| 471 |
+
[2026-01-08 20:20:02 root] (train_utils.py 185): INFO layer 24 lwc lac iter 0, lr 0.00494542 time 5.248398s, mse: 3.33080626
|
| 472 |
+
[2026-01-08 20:20:06 root] (train_utils.py 185): INFO layer 24 lwc lac iter 1, lr 0.00478408 time 3.880547s, mse: 2.21739531
|
| 473 |
+
[2026-01-08 20:20:10 root] (train_utils.py 185): INFO layer 24 lwc lac iter 2, lr 0.00452302 time 3.877884s, mse: 1.83558488
|
| 474 |
+
[2026-01-08 20:20:14 root] (train_utils.py 185): INFO layer 24 lwc lac iter 3, lr 0.00417365 time 3.877035s, mse: 1.75192118
|
| 475 |
+
[2026-01-08 20:20:18 root] (train_utils.py 185): INFO layer 24 lwc lac iter 4, lr 0.00375125 time 3.878414s, mse: 1.73021388
|
| 476 |
+
[2026-01-08 20:20:22 root] (train_utils.py 185): INFO layer 24 lwc lac iter 5, lr 0.00327427 time 3.876617s, mse: 1.70965135
|
| 477 |
+
[2026-01-08 20:20:26 root] (train_utils.py 185): INFO layer 24 lwc lac iter 6, lr 0.00276356 time 3.885743s, mse: 1.69753647
|
| 478 |
+
[2026-01-08 20:20:30 root] (train_utils.py 185): INFO layer 24 lwc lac iter 7, lr 0.00224144 time 3.880893s, mse: 1.68364048
|
| 479 |
+
[2026-01-08 20:20:34 root] (train_utils.py 185): INFO layer 24 lwc lac iter 8, lr 0.00173073 time 3.879582s, mse: 1.67123342
|
| 480 |
+
[2026-01-08 20:20:37 root] (train_utils.py 185): INFO layer 24 lwc lac iter 9, lr 0.00125375 time 3.884786s, mse: 1.66224420
|
| 481 |
+
[2026-01-08 20:20:41 root] (train_utils.py 185): INFO layer 24 lwc lac iter 10, lr 0.00083135 time 3.881520s, mse: 1.65476453
|
| 482 |
+
[2026-01-08 20:20:45 root] (train_utils.py 185): INFO layer 24 lwc lac iter 11, lr 0.00048198 time 3.883099s, mse: 1.64498436
|
| 483 |
+
[2026-01-08 20:20:49 root] (train_utils.py 185): INFO layer 24 lwc lac iter 12, lr 0.00022092 time 4.338018s, mse: 1.63647079
|
| 484 |
+
[2026-01-08 20:20:54 root] (train_utils.py 185): INFO layer 24 lwc lac iter 13, lr 0.00005958 time 4.411990s, mse: 1.63291585
|
| 485 |
+
[2026-01-08 20:20:58 root] (train_utils.py 185): INFO layer 24 lwc lac iter 14, lr 0.00000500 time 4.227370s, mse: 1.63007939
|
| 486 |
+
[2026-01-08 20:20:59 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 487 |
+
[2026-01-08 20:21:00 root] (train_utils.py 108): INFO ========= Layer 25 =========
|
| 488 |
+
[2026-01-08 20:21:07 root] (train_utils.py 185): INFO layer 25 lwc lac iter 0, lr 0.00494542 time 4.700488s, mse: 3.67945337
|
| 489 |
+
[2026-01-08 20:21:11 root] (train_utils.py 185): INFO layer 25 lwc lac iter 1, lr 0.00478408 time 3.908360s, mse: 2.39840055
|
| 490 |
+
[2026-01-08 20:21:15 root] (train_utils.py 185): INFO layer 25 lwc lac iter 2, lr 0.00452302 time 3.906975s, mse: 2.00158238
|
| 491 |
+
[2026-01-08 20:21:19 root] (train_utils.py 185): INFO layer 25 lwc lac iter 3, lr 0.00417365 time 3.874880s, mse: 1.92655563
|
| 492 |
+
[2026-01-08 20:21:22 root] (train_utils.py 185): INFO layer 25 lwc lac iter 4, lr 0.00375125 time 3.876819s, mse: 1.90741169
|
| 493 |
+
[2026-01-08 20:21:26 root] (train_utils.py 185): INFO layer 25 lwc lac iter 5, lr 0.00327427 time 3.876056s, mse: 1.89064825
|
| 494 |
+
[2026-01-08 20:21:30 root] (train_utils.py 185): INFO layer 25 lwc lac iter 6, lr 0.00276356 time 3.879740s, mse: 1.88254857
|
| 495 |
+
[2026-01-08 20:21:34 root] (train_utils.py 185): INFO layer 25 lwc lac iter 7, lr 0.00224144 time 3.875947s, mse: 1.87189174
|
| 496 |
+
[2026-01-08 20:21:38 root] (train_utils.py 185): INFO layer 25 lwc lac iter 8, lr 0.00173073 time 3.891793s, mse: 1.86226833
|
| 497 |
+
[2026-01-08 20:21:42 root] (train_utils.py 185): INFO layer 25 lwc lac iter 9, lr 0.00125375 time 3.879055s, mse: 1.85414529
|
| 498 |
+
[2026-01-08 20:21:46 root] (train_utils.py 185): INFO layer 25 lwc lac iter 10, lr 0.00083135 time 3.878680s, mse: 1.84632003
|
| 499 |
+
[2026-01-08 20:21:50 root] (train_utils.py 185): INFO layer 25 lwc lac iter 11, lr 0.00048198 time 3.879498s, mse: 1.83962476
|
| 500 |
+
[2026-01-08 20:21:53 root] (train_utils.py 185): INFO layer 25 lwc lac iter 12, lr 0.00022092 time 3.880664s, mse: 1.83272731
|
| 501 |
+
[2026-01-08 20:21:58 root] (train_utils.py 185): INFO layer 25 lwc lac iter 13, lr 0.00005958 time 4.080587s, mse: 1.83188641
|
| 502 |
+
[2026-01-08 20:22:01 root] (train_utils.py 185): INFO layer 25 lwc lac iter 14, lr 0.00000500 time 3.874877s, mse: 1.82856822
|
| 503 |
+
[2026-01-08 20:22:02 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 504 |
+
[2026-01-08 20:22:03 root] (train_utils.py 108): INFO ========= Layer 26 =========
|
| 505 |
+
[2026-01-08 20:22:11 root] (train_utils.py 185): INFO layer 26 lwc lac iter 0, lr 0.00494542 time 5.800936s, mse: 4.35819054
|
| 506 |
+
[2026-01-08 20:22:15 root] (train_utils.py 185): INFO layer 26 lwc lac iter 1, lr 0.00478408 time 3.951150s, mse: 2.94494462
|
| 507 |
+
[2026-01-08 20:22:19 root] (train_utils.py 185): INFO layer 26 lwc lac iter 2, lr 0.00452302 time 3.874370s, mse: 2.46222878
|
| 508 |
+
[2026-01-08 20:22:23 root] (train_utils.py 185): INFO layer 26 lwc lac iter 3, lr 0.00417365 time 3.879827s, mse: 2.36697221
|
| 509 |
+
[2026-01-08 20:22:27 root] (train_utils.py 185): INFO layer 26 lwc lac iter 4, lr 0.00375125 time 3.873295s, mse: 2.34871936
|
| 510 |
+
[2026-01-08 20:22:31 root] (train_utils.py 185): INFO layer 26 lwc lac iter 5, lr 0.00327427 time 3.881480s, mse: 2.33013940
|
| 511 |
+
[2026-01-08 20:22:34 root] (train_utils.py 185): INFO layer 26 lwc lac iter 6, lr 0.00276356 time 3.875904s, mse: 2.31725478
|
| 512 |
+
[2026-01-08 20:22:38 root] (train_utils.py 185): INFO layer 26 lwc lac iter 7, lr 0.00224144 time 3.879355s, mse: 2.30295658
|
| 513 |
+
[2026-01-08 20:22:42 root] (train_utils.py 185): INFO layer 26 lwc lac iter 8, lr 0.00173073 time 3.883507s, mse: 2.29171467
|
| 514 |
+
[2026-01-08 20:22:46 root] (train_utils.py 185): INFO layer 26 lwc lac iter 9, lr 0.00125375 time 3.878979s, mse: 2.28112888
|
| 515 |
+
[2026-01-08 20:22:50 root] (train_utils.py 185): INFO layer 26 lwc lac iter 10, lr 0.00083135 time 3.874169s, mse: 2.27260423
|
| 516 |
+
[2026-01-08 20:22:54 root] (train_utils.py 185): INFO layer 26 lwc lac iter 11, lr 0.00048198 time 3.872427s, mse: 2.26187754
|
| 517 |
+
[2026-01-08 20:22:58 root] (train_utils.py 185): INFO layer 26 lwc lac iter 12, lr 0.00022092 time 3.878131s, mse: 2.25517917
|
| 518 |
+
[2026-01-08 20:23:02 root] (train_utils.py 185): INFO layer 26 lwc lac iter 13, lr 0.00005958 time 3.877979s, mse: 2.24800634
|
| 519 |
+
[2026-01-08 20:23:05 root] (train_utils.py 185): INFO layer 26 lwc lac iter 14, lr 0.00000500 time 3.879482s, mse: 2.24403787
|
| 520 |
+
[2026-01-08 20:23:06 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 521 |
+
[2026-01-08 20:23:07 root] (train_utils.py 108): INFO ========= Layer 27 =========
|
| 522 |
+
[2026-01-08 20:23:16 root] (train_utils.py 185): INFO layer 27 lwc lac iter 0, lr 0.00494542 time 5.418776s, mse: 5.94560862
|
| 523 |
+
[2026-01-08 20:23:19 root] (train_utils.py 185): INFO layer 27 lwc lac iter 1, lr 0.00478408 time 3.875828s, mse: 3.95834851
|
| 524 |
+
[2026-01-08 20:23:23 root] (train_utils.py 185): INFO layer 27 lwc lac iter 2, lr 0.00452302 time 3.876054s, mse: 3.32281756
|
| 525 |
+
[2026-01-08 20:23:27 root] (train_utils.py 185): INFO layer 27 lwc lac iter 3, lr 0.00417365 time 3.876422s, mse: 3.18086267
|
| 526 |
+
[2026-01-08 20:23:31 root] (train_utils.py 185): INFO layer 27 lwc lac iter 4, lr 0.00375125 time 3.874758s, mse: 3.14467168
|
| 527 |
+
[2026-01-08 20:23:35 root] (train_utils.py 185): INFO layer 27 lwc lac iter 5, lr 0.00327427 time 3.882139s, mse: 3.12000346
|
| 528 |
+
[2026-01-08 20:23:39 root] (train_utils.py 185): INFO layer 27 lwc lac iter 6, lr 0.00276356 time 3.877449s, mse: 3.09776139
|
| 529 |
+
[2026-01-08 20:23:43 root] (train_utils.py 185): INFO layer 27 lwc lac iter 7, lr 0.00224144 time 3.880479s, mse: 3.07834363
|
| 530 |
+
[2026-01-08 20:23:47 root] (train_utils.py 185): INFO layer 27 lwc lac iter 8, lr 0.00173073 time 3.899203s, mse: 3.06277657
|
| 531 |
+
[2026-01-08 20:23:50 root] (train_utils.py 185): INFO layer 27 lwc lac iter 9, lr 0.00125375 time 3.879466s, mse: 3.04591680
|
| 532 |
+
[2026-01-08 20:23:54 root] (train_utils.py 185): INFO layer 27 lwc lac iter 10, lr 0.00083135 time 3.876263s, mse: 3.03134632
|
| 533 |
+
[2026-01-08 20:23:58 root] (train_utils.py 185): INFO layer 27 lwc lac iter 11, lr 0.00048198 time 3.881885s, mse: 3.01916480
|
| 534 |
+
[2026-01-08 20:24:02 root] (train_utils.py 185): INFO layer 27 lwc lac iter 12, lr 0.00022092 time 3.881700s, mse: 3.00719571
|
| 535 |
+
[2026-01-08 20:24:06 root] (train_utils.py 185): INFO layer 27 lwc lac iter 13, lr 0.00005958 time 3.874153s, mse: 2.99984956
|
| 536 |
+
[2026-01-08 20:24:10 root] (train_utils.py 185): INFO layer 27 lwc lac iter 14, lr 0.00000500 time 3.875395s, mse: 2.99120903
|
| 537 |
+
[2026-01-08 20:24:10 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 538 |
+
[2026-01-08 20:24:11 root] (train_utils.py 108): INFO ========= Layer 28 =========
|
| 539 |
+
[2026-01-08 20:24:19 root] (train_utils.py 185): INFO layer 28 lwc lac iter 0, lr 0.00494542 time 4.966117s, mse: 8.40579605
|
| 540 |
+
[2026-01-08 20:24:23 root] (train_utils.py 185): INFO layer 28 lwc lac iter 1, lr 0.00478408 time 3.873255s, mse: 5.55529737
|
| 541 |
+
[2026-01-08 20:24:26 root] (train_utils.py 185): INFO layer 28 lwc lac iter 2, lr 0.00452302 time 3.885448s, mse: 4.64479589
|
| 542 |
+
[2026-01-08 20:24:30 root] (train_utils.py 185): INFO layer 28 lwc lac iter 3, lr 0.00417365 time 3.878376s, mse: 4.46341419
|
| 543 |
+
[2026-01-08 20:24:34 root] (train_utils.py 185): INFO layer 28 lwc lac iter 4, lr 0.00375125 time 3.879312s, mse: 4.40386772
|
| 544 |
+
[2026-01-08 20:24:38 root] (train_utils.py 185): INFO layer 28 lwc lac iter 5, lr 0.00327427 time 3.882168s, mse: 4.37245226
|
| 545 |
+
[2026-01-08 20:24:42 root] (train_utils.py 185): INFO layer 28 lwc lac iter 6, lr 0.00276356 time 3.881505s, mse: 4.34240580
|
| 546 |
+
[2026-01-08 20:24:46 root] (train_utils.py 185): INFO layer 28 lwc lac iter 7, lr 0.00224144 time 3.881425s, mse: 4.31763363
|
| 547 |
+
[2026-01-08 20:24:50 root] (train_utils.py 185): INFO layer 28 lwc lac iter 8, lr 0.00173073 time 3.878464s, mse: 4.29854107
|
| 548 |
+
[2026-01-08 20:24:54 root] (train_utils.py 185): INFO layer 28 lwc lac iter 9, lr 0.00125375 time 3.876505s, mse: 4.28071547
|
| 549 |
+
[2026-01-08 20:24:57 root] (train_utils.py 185): INFO layer 28 lwc lac iter 10, lr 0.00083135 time 3.881535s, mse: 4.26679897
|
| 550 |
+
[2026-01-08 20:25:01 root] (train_utils.py 185): INFO layer 28 lwc lac iter 11, lr 0.00048198 time 3.877203s, mse: 4.24268007
|
| 551 |
+
[2026-01-08 20:25:05 root] (train_utils.py 185): INFO layer 28 lwc lac iter 12, lr 0.00022092 time 3.876781s, mse: 4.22641373
|
| 552 |
+
[2026-01-08 20:25:09 root] (train_utils.py 185): INFO layer 28 lwc lac iter 13, lr 0.00005958 time 3.874362s, mse: 4.22128248
|
| 553 |
+
[2026-01-08 20:25:13 root] (train_utils.py 185): INFO layer 28 lwc lac iter 14, lr 0.00000500 time 3.876956s, mse: 4.21494389
|
| 554 |
+
[2026-01-08 20:25:13 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 555 |
+
[2026-01-08 20:25:14 root] (train_utils.py 108): INFO ========= Layer 29 =========
|
| 556 |
+
[2026-01-08 20:25:21 root] (train_utils.py 185): INFO layer 29 lwc lac iter 0, lr 0.00494542 time 4.975133s, mse: 10.38746834
|
| 557 |
+
[2026-01-08 20:25:25 root] (train_utils.py 185): INFO layer 29 lwc lac iter 1, lr 0.00478408 time 3.876715s, mse: 7.14648628
|
| 558 |
+
[2026-01-08 20:25:29 root] (train_utils.py 185): INFO layer 29 lwc lac iter 2, lr 0.00452302 time 3.880992s, mse: 6.03318691
|
| 559 |
+
[2026-01-08 20:25:33 root] (train_utils.py 185): INFO layer 29 lwc lac iter 3, lr 0.00417365 time 3.880460s, mse: 5.78764057
|
| 560 |
+
[2026-01-08 20:25:37 root] (train_utils.py 185): INFO layer 29 lwc lac iter 4, lr 0.00375125 time 3.886153s, mse: 5.71550655
|
| 561 |
+
[2026-01-08 20:25:41 root] (train_utils.py 185): INFO layer 29 lwc lac iter 5, lr 0.00327427 time 3.883043s, mse: 5.66473246
|
| 562 |
+
[2026-01-08 20:25:45 root] (train_utils.py 185): INFO layer 29 lwc lac iter 6, lr 0.00276356 time 3.884337s, mse: 5.61916113
|
| 563 |
+
[2026-01-08 20:25:49 root] (train_utils.py 185): INFO layer 29 lwc lac iter 7, lr 0.00224144 time 3.878447s, mse: 5.58458805
|
| 564 |
+
[2026-01-08 20:25:52 root] (train_utils.py 185): INFO layer 29 lwc lac iter 8, lr 0.00173073 time 3.873308s, mse: 5.54784393
|
| 565 |
+
[2026-01-08 20:25:56 root] (train_utils.py 185): INFO layer 29 lwc lac iter 9, lr 0.00125375 time 3.882202s, mse: 5.52231646
|
| 566 |
+
[2026-01-08 20:26:00 root] (train_utils.py 185): INFO layer 29 lwc lac iter 10, lr 0.00083135 time 3.881016s, mse: 5.48976994
|
| 567 |
+
[2026-01-08 20:26:04 root] (train_utils.py 185): INFO layer 29 lwc lac iter 11, lr 0.00048198 time 3.876146s, mse: 5.46507311
|
| 568 |
+
[2026-01-08 20:26:08 root] (train_utils.py 185): INFO layer 29 lwc lac iter 12, lr 0.00022092 time 3.877000s, mse: 5.44575977
|
| 569 |
+
[2026-01-08 20:26:12 root] (train_utils.py 185): INFO layer 29 lwc lac iter 13, lr 0.00005958 time 3.881991s, mse: 5.43577242
|
| 570 |
+
[2026-01-08 20:26:16 root] (train_utils.py 185): INFO layer 29 lwc lac iter 14, lr 0.00000500 time 3.874315s, mse: 5.42604542
|
| 571 |
+
[2026-01-08 20:26:16 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 572 |
+
[2026-01-08 20:26:17 root] (train_utils.py 108): INFO ========= Layer 30 =========
|
| 573 |
+
[2026-01-08 20:26:25 root] (train_utils.py 185): INFO layer 30 lwc lac iter 0, lr 0.00494542 time 5.156940s, mse: 16.29405975
|
| 574 |
+
[2026-01-08 20:26:29 root] (train_utils.py 185): INFO layer 30 lwc lac iter 1, lr 0.00478408 time 3.891409s, mse: 11.01632500
|
| 575 |
+
[2026-01-08 20:26:32 root] (train_utils.py 185): INFO layer 30 lwc lac iter 2, lr 0.00452302 time 3.874850s, mse: 9.27882481
|
| 576 |
+
[2026-01-08 20:26:36 root] (train_utils.py 185): INFO layer 30 lwc lac iter 3, lr 0.00417365 time 3.885643s, mse: 8.87542439
|
| 577 |
+
[2026-01-08 20:26:40 root] (train_utils.py 185): INFO layer 30 lwc lac iter 4, lr 0.00375125 time 3.875562s, mse: 8.75351048
|
| 578 |
+
[2026-01-08 20:26:44 root] (train_utils.py 185): INFO layer 30 lwc lac iter 5, lr 0.00327427 time 3.872500s, mse: 8.65880680
|
| 579 |
+
[2026-01-08 20:26:48 root] (train_utils.py 185): INFO layer 30 lwc lac iter 6, lr 0.00276356 time 3.884465s, mse: 8.60634327
|
| 580 |
+
[2026-01-08 20:26:52 root] (train_utils.py 185): INFO layer 30 lwc lac iter 7, lr 0.00224144 time 3.870189s, mse: 8.53597736
|
| 581 |
+
[2026-01-08 20:26:56 root] (train_utils.py 185): INFO layer 30 lwc lac iter 8, lr 0.00173073 time 3.875866s, mse: 8.50352001
|
| 582 |
+
[2026-01-08 20:27:00 root] (train_utils.py 185): INFO layer 30 lwc lac iter 9, lr 0.00125375 time 3.874484s, mse: 8.44190311
|
| 583 |
+
[2026-01-08 20:27:03 root] (train_utils.py 185): INFO layer 30 lwc lac iter 10, lr 0.00083135 time 3.877156s, mse: 8.40491486
|
| 584 |
+
[2026-01-08 20:27:07 root] (train_utils.py 185): INFO layer 30 lwc lac iter 11, lr 0.00048198 time 3.874284s, mse: 8.38511753
|
| 585 |
+
[2026-01-08 20:27:11 root] (train_utils.py 185): INFO layer 30 lwc lac iter 12, lr 0.00022092 time 3.874029s, mse: 8.35692787
|
| 586 |
+
[2026-01-08 20:27:15 root] (train_utils.py 185): INFO layer 30 lwc lac iter 13, lr 0.00005958 time 3.880367s, mse: 8.35674667
|
| 587 |
+
[2026-01-08 20:27:19 root] (train_utils.py 185): INFO layer 30 lwc lac iter 14, lr 0.00000500 time 3.877804s, mse: 8.34408569
|
| 588 |
+
[2026-01-08 20:27:19 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 589 |
+
[2026-01-08 20:27:20 root] (train_utils.py 108): INFO ========= Layer 31 =========
|
| 590 |
+
[2026-01-08 20:27:29 root] (train_utils.py 185): INFO layer 31 lwc lac iter 0, lr 0.00494542 time 5.676542s, mse: 20.78250885
|
| 591 |
+
[2026-01-08 20:27:33 root] (train_utils.py 185): INFO layer 31 lwc lac iter 1, lr 0.00478408 time 3.947701s, mse: 14.37235165
|
| 592 |
+
[2026-01-08 20:27:36 root] (train_utils.py 185): INFO layer 31 lwc lac iter 2, lr 0.00452302 time 3.875565s, mse: 12.13233566
|
| 593 |
+
[2026-01-08 20:27:40 root] (train_utils.py 185): INFO layer 31 lwc lac iter 3, lr 0.00417365 time 3.872178s, mse: 11.62570667
|
| 594 |
+
[2026-01-08 20:27:44 root] (train_utils.py 185): INFO layer 31 lwc lac iter 4, lr 0.00375125 time 3.876312s, mse: 11.51362991
|
| 595 |
+
[2026-01-08 20:27:48 root] (train_utils.py 185): INFO layer 31 lwc lac iter 5, lr 0.00327427 time 3.880129s, mse: 11.42485142
|
| 596 |
+
[2026-01-08 20:27:52 root] (train_utils.py 185): INFO layer 31 lwc lac iter 6, lr 0.00276356 time 3.876696s, mse: 11.33607769
|
| 597 |
+
[2026-01-08 20:27:56 root] (train_utils.py 185): INFO layer 31 lwc lac iter 7, lr 0.00224144 time 3.879226s, mse: 11.27843571
|
| 598 |
+
[2026-01-08 20:28:00 root] (train_utils.py 185): INFO layer 31 lwc lac iter 8, lr 0.00173073 time 3.872689s, mse: 11.22037888
|
| 599 |
+
[2026-01-08 20:28:04 root] (train_utils.py 185): INFO layer 31 lwc lac iter 9, lr 0.00125375 time 3.875676s, mse: 11.15839195
|
| 600 |
+
[2026-01-08 20:28:07 root] (train_utils.py 185): INFO layer 31 lwc lac iter 10, lr 0.00083135 time 3.871733s, mse: 11.12734127
|
| 601 |
+
[2026-01-08 20:28:11 root] (train_utils.py 185): INFO layer 31 lwc lac iter 11, lr 0.00048198 time 3.876626s, mse: 11.08810806
|
| 602 |
+
[2026-01-08 20:28:15 root] (train_utils.py 185): INFO layer 31 lwc lac iter 12, lr 0.00022092 time 3.878724s, mse: 11.05513668
|
| 603 |
+
[2026-01-08 20:28:19 root] (train_utils.py 185): INFO layer 31 lwc lac iter 13, lr 0.00005958 time 3.879174s, mse: 11.03436947
|
| 604 |
+
[2026-01-08 20:28:23 root] (train_utils.py 185): INFO layer 31 lwc lac iter 14, lr 0.00000500 time 3.875401s, mse: 11.01393795
|
| 605 |
+
[2026-01-08 20:28:23 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 606 |
+
[2026-01-08 20:28:24 root] (train_utils.py 108): INFO ========= Layer 32 =========
|
| 607 |
+
[2026-01-08 20:28:32 root] (train_utils.py 185): INFO layer 32 lwc lac iter 0, lr 0.00494542 time 5.014384s, mse: 28.37956429
|
| 608 |
+
[2026-01-08 20:28:36 root] (train_utils.py 185): INFO layer 32 lwc lac iter 1, lr 0.00478408 time 3.874096s, mse: 19.76789856
|
| 609 |
+
[2026-01-08 20:28:39 root] (train_utils.py 185): INFO layer 32 lwc lac iter 2, lr 0.00452302 time 3.879622s, mse: 16.61169624
|
| 610 |
+
[2026-01-08 20:28:43 root] (train_utils.py 185): INFO layer 32 lwc lac iter 3, lr 0.00417365 time 3.878350s, mse: 15.88970184
|
| 611 |
+
[2026-01-08 20:28:47 root] (train_utils.py 185): INFO layer 32 lwc lac iter 4, lr 0.00375125 time 3.873660s, mse: 15.74769402
|
| 612 |
+
[2026-01-08 20:28:51 root] (train_utils.py 185): INFO layer 32 lwc lac iter 5, lr 0.00327427 time 3.872789s, mse: 15.61922455
|
| 613 |
+
[2026-01-08 20:28:55 root] (train_utils.py 185): INFO layer 32 lwc lac iter 6, lr 0.00276356 time 3.873571s, mse: 15.51004982
|
| 614 |
+
[2026-01-08 20:28:59 root] (train_utils.py 185): INFO layer 32 lwc lac iter 7, lr 0.00224144 time 3.873817s, mse: 15.42904854
|
| 615 |
+
[2026-01-08 20:29:03 root] (train_utils.py 185): INFO layer 32 lwc lac iter 8, lr 0.00173073 time 3.879045s, mse: 15.34880447
|
| 616 |
+
[2026-01-08 20:29:07 root] (train_utils.py 185): INFO layer 32 lwc lac iter 9, lr 0.00125375 time 3.876587s, mse: 15.27359772
|
| 617 |
+
[2026-01-08 20:29:10 root] (train_utils.py 185): INFO layer 32 lwc lac iter 10, lr 0.00083135 time 3.876626s, mse: 15.21441174
|
| 618 |
+
[2026-01-08 20:29:14 root] (train_utils.py 185): INFO layer 32 lwc lac iter 11, lr 0.00048198 time 3.873835s, mse: 15.16252708
|
| 619 |
+
[2026-01-08 20:29:18 root] (train_utils.py 185): INFO layer 32 lwc lac iter 12, lr 0.00022092 time 3.877970s, mse: 15.10843849
|
| 620 |
+
[2026-01-08 20:29:22 root] (train_utils.py 185): INFO layer 32 lwc lac iter 13, lr 0.00005958 time 3.876159s, mse: 15.08382893
|
| 621 |
+
[2026-01-08 20:29:26 root] (train_utils.py 185): INFO layer 32 lwc lac iter 14, lr 0.00000500 time 3.866788s, mse: 15.06546974
|
| 622 |
+
[2026-01-08 20:29:26 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 623 |
+
[2026-01-08 20:29:27 root] (train_utils.py 108): INFO ========= Layer 33 =========
|
| 624 |
+
[2026-01-08 20:29:35 root] (train_utils.py 185): INFO layer 33 lwc lac iter 0, lr 0.00494542 time 5.076409s, mse: 41.54327011
|
| 625 |
+
[2026-01-08 20:29:39 root] (train_utils.py 185): INFO layer 33 lwc lac iter 1, lr 0.00478408 time 3.883298s, mse: 27.93664551
|
| 626 |
+
[2026-01-08 20:29:43 root] (train_utils.py 185): INFO layer 33 lwc lac iter 2, lr 0.00452302 time 3.877707s, mse: 23.32941628
|
| 627 |
+
[2026-01-08 20:29:47 root] (train_utils.py 185): INFO layer 33 lwc lac iter 3, lr 0.00417365 time 3.931390s, mse: 22.34293175
|
| 628 |
+
[2026-01-08 20:29:51 root] (train_utils.py 185): INFO layer 33 lwc lac iter 4, lr 0.00375125 time 3.876992s, mse: 22.07669640
|
| 629 |
+
[2026-01-08 20:29:54 root] (train_utils.py 185): INFO layer 33 lwc lac iter 5, lr 0.00327427 time 3.892163s, mse: 21.87960243
|
| 630 |
+
[2026-01-08 20:29:58 root] (train_utils.py 185): INFO layer 33 lwc lac iter 6, lr 0.00276356 time 3.882131s, mse: 21.73635674
|
| 631 |
+
[2026-01-08 20:30:02 root] (train_utils.py 185): INFO layer 33 lwc lac iter 7, lr 0.00224144 time 3.886717s, mse: 21.58724403
|
| 632 |
+
[2026-01-08 20:30:06 root] (train_utils.py 185): INFO layer 33 lwc lac iter 8, lr 0.00173073 time 3.884485s, mse: 21.46766853
|
| 633 |
+
[2026-01-08 20:30:10 root] (train_utils.py 185): INFO layer 33 lwc lac iter 9, lr 0.00125375 time 3.888596s, mse: 21.36098099
|
| 634 |
+
[2026-01-08 20:30:14 root] (train_utils.py 185): INFO layer 33 lwc lac iter 10, lr 0.00083135 time 3.884719s, mse: 21.27636719
|
| 635 |
+
[2026-01-08 20:30:18 root] (train_utils.py 185): INFO layer 33 lwc lac iter 11, lr 0.00048198 time 3.906380s, mse: 21.16030693
|
| 636 |
+
[2026-01-08 20:30:22 root] (train_utils.py 185): INFO layer 33 lwc lac iter 12, lr 0.00022092 time 3.878760s, mse: 21.07536125
|
| 637 |
+
[2026-01-08 20:30:26 root] (train_utils.py 185): INFO layer 33 lwc lac iter 13, lr 0.00005958 time 3.876303s, mse: 20.99114990
|
| 638 |
+
[2026-01-08 20:30:29 root] (train_utils.py 185): INFO layer 33 lwc lac iter 14, lr 0.00000500 time 3.879338s, mse: 20.95961761
|
| 639 |
+
[2026-01-08 20:30:30 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 640 |
+
[2026-01-08 20:30:31 root] (train_utils.py 108): INFO ========= Layer 34 =========
|
| 641 |
+
[2026-01-08 20:30:38 root] (train_utils.py 185): INFO layer 34 lwc lac iter 0, lr 0.00494542 time 5.304806s, mse: 64.93594360
|
| 642 |
+
[2026-01-08 20:30:42 root] (train_utils.py 185): INFO layer 34 lwc lac iter 1, lr 0.00478408 time 3.879078s, mse: 40.86461258
|
| 643 |
+
[2026-01-08 20:30:46 root] (train_utils.py 185): INFO layer 34 lwc lac iter 2, lr 0.00452302 time 3.877936s, mse: 33.65349960
|
| 644 |
+
[2026-01-08 20:30:50 root] (train_utils.py 185): INFO layer 34 lwc lac iter 3, lr 0.00417365 time 3.873914s, mse: 31.96302605
|
| 645 |
+
[2026-01-08 20:30:54 root] (train_utils.py 185): INFO layer 34 lwc lac iter 4, lr 0.00375125 time 3.869976s, mse: 31.66926384
|
| 646 |
+
[2026-01-08 20:30:58 root] (train_utils.py 185): INFO layer 34 lwc lac iter 5, lr 0.00327427 time 3.870930s, mse: 31.07656479
|
| 647 |
+
[2026-01-08 20:31:02 root] (train_utils.py 185): INFO layer 34 lwc lac iter 6, lr 0.00276356 time 3.873630s, mse: 30.91048813
|
| 648 |
+
[2026-01-08 20:31:06 root] (train_utils.py 185): INFO layer 34 lwc lac iter 7, lr 0.00224144 time 3.874344s, mse: 30.05115700
|
| 649 |
+
[2026-01-08 20:31:09 root] (train_utils.py 185): INFO layer 34 lwc lac iter 8, lr 0.00173073 time 3.875742s, mse: 29.89023590
|
| 650 |
+
[2026-01-08 20:31:13 root] (train_utils.py 185): INFO layer 34 lwc lac iter 9, lr 0.00125375 time 3.876966s, mse: 30.35319901
|
| 651 |
+
[2026-01-08 20:31:17 root] (train_utils.py 185): INFO layer 34 lwc lac iter 10, lr 0.00083135 time 3.879142s, mse: 29.46559715
|
| 652 |
+
[2026-01-08 20:31:21 root] (train_utils.py 185): INFO layer 34 lwc lac iter 11, lr 0.00048198 time 3.871687s, mse: 29.05239487
|
| 653 |
+
[2026-01-08 20:31:25 root] (train_utils.py 185): INFO layer 34 lwc lac iter 12, lr 0.00022092 time 3.872833s, mse: 28.86521339
|
| 654 |
+
[2026-01-08 20:31:29 root] (train_utils.py 185): INFO layer 34 lwc lac iter 13, lr 0.00005958 time 3.881875s, mse: 28.74409676
|
| 655 |
+
[2026-01-08 20:31:33 root] (train_utils.py 185): INFO layer 34 lwc lac iter 14, lr 0.00000500 time 3.879118s, mse: 28.70412636
|
| 656 |
+
[2026-01-08 20:31:33 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 657 |
+
[2026-01-08 20:31:34 root] (train_utils.py 108): INFO ========= Layer 35 =========
|
| 658 |
+
[2026-01-08 20:31:42 root] (train_utils.py 185): INFO layer 35 lwc lac iter 0, lr 0.00494542 time 5.142302s, mse: 108.25781250
|
| 659 |
+
[2026-01-08 20:31:46 root] (train_utils.py 185): INFO layer 35 lwc lac iter 1, lr 0.00478408 time 3.879463s, mse: 38.04971313
|
| 660 |
+
[2026-01-08 20:31:49 root] (train_utils.py 185): INFO layer 35 lwc lac iter 2, lr 0.00452302 time 3.874327s, mse: 31.63025665
|
| 661 |
+
[2026-01-08 20:31:53 root] (train_utils.py 185): INFO layer 35 lwc lac iter 3, lr 0.00417365 time 3.874362s, mse: 29.21376991
|
| 662 |
+
[2026-01-08 20:31:57 root] (train_utils.py 185): INFO layer 35 lwc lac iter 4, lr 0.00375125 time 3.884794s, mse: 28.19089508
|
| 663 |
+
[2026-01-08 20:32:01 root] (train_utils.py 185): INFO layer 35 lwc lac iter 5, lr 0.00327427 time 3.873233s, mse: 28.40728760
|
| 664 |
+
[2026-01-08 20:32:05 root] (train_utils.py 185): INFO layer 35 lwc lac iter 6, lr 0.00276356 time 3.876803s, mse: 27.74842644
|
| 665 |
+
[2026-01-08 20:32:09 root] (train_utils.py 185): INFO layer 35 lwc lac iter 7, lr 0.00224144 time 3.873011s, mse: 27.13273811
|
| 666 |
+
[2026-01-08 20:32:13 root] (train_utils.py 185): INFO layer 35 lwc lac iter 8, lr 0.00173073 time 3.873823s, mse: 26.53238487
|
| 667 |
+
[2026-01-08 20:32:17 root] (train_utils.py 185): INFO layer 35 lwc lac iter 9, lr 0.00125375 time 3.874454s, mse: 26.14052200
|
| 668 |
+
[2026-01-08 20:32:20 root] (train_utils.py 185): INFO layer 35 lwc lac iter 10, lr 0.00083135 time 3.885984s, mse: 25.63203621
|
| 669 |
+
[2026-01-08 20:32:24 root] (train_utils.py 185): INFO layer 35 lwc lac iter 11, lr 0.00048198 time 3.874369s, mse: 25.35079384
|
| 670 |
+
[2026-01-08 20:32:28 root] (train_utils.py 185): INFO layer 35 lwc lac iter 12, lr 0.00022092 time 3.883839s, mse: 25.21109390
|
| 671 |
+
[2026-01-08 20:32:32 root] (train_utils.py 185): INFO layer 35 lwc lac iter 13, lr 0.00005958 time 3.875901s, mse: 24.95710945
|
| 672 |
+
[2026-01-08 20:32:36 root] (train_utils.py 185): INFO layer 35 lwc lac iter 14, lr 0.00000500 time 3.888915s, mse: 24.85692596
|
| 673 |
+
[2026-01-08 20:32:36 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 674 |
+
[2026-01-08 20:33:19 root] (main.py 39): INFO Finished reparameterize model.
|
| 675 |
+
[2026-01-08 20:33:46 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.27 -> 0.25 GB (-0.02 GB)
|
| 676 |
+
[2026-01-08 20:34:11 root] (flat_utils.py 204): INFO saved weights at ./outputs/Qwen3-8B/w4a4/exp
|
| 677 |
+
[2026-01-08 20:34:24 root] (main.py 60): INFO wikitext2
|
| 678 |
+
[2026-01-08 20:35:05 root] (main.py 69): INFO 10.271322250366211
|
| 679 |
+
[2026-01-08 20:35:05 root] (main.py 60): INFO c4
|
| 680 |
+
[2026-01-08 20:36:14 root] (main.py 69): INFO 16.169748306274414
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260109_092702.txt
ADDED
|
@@ -0,0 +1,680 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-09 09:27:02 root] (args_utils.py 168): INFO Arguments:
|
| 2 |
+
[2026-01-09 09:27:02 root] (args_utils.py 169): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': True,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': False,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': 128,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-09 09:27:02 root] (args_utils.py 170): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-09 09:27:03 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-09 09:27:19 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-09 09:27:24 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 62 |
+
[2026-01-09 09:27:26 root] (train_utils.py 108): INFO ========= Layer 0 =========
|
| 63 |
+
[2026-01-09 09:27:34 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 5.354077s, mse: 0.01574295
|
| 64 |
+
[2026-01-09 09:27:38 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.918327s, mse: 0.01115426
|
| 65 |
+
[2026-01-09 09:27:42 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.884766s, mse: 0.00938093
|
| 66 |
+
[2026-01-09 09:27:46 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.885943s, mse: 0.00881439
|
| 67 |
+
[2026-01-09 09:27:50 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.880416s, mse: 0.00857142
|
| 68 |
+
[2026-01-09 09:27:53 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.886740s, mse: 0.00849318
|
| 69 |
+
[2026-01-09 09:27:57 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.884636s, mse: 0.00832680
|
| 70 |
+
[2026-01-09 09:28:01 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.883160s, mse: 0.00828776
|
| 71 |
+
[2026-01-09 09:28:05 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 3.903839s, mse: 0.00818714
|
| 72 |
+
[2026-01-09 09:28:09 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.949619s, mse: 0.00813103
|
| 73 |
+
[2026-01-09 09:28:13 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 3.970162s, mse: 0.00808381
|
| 74 |
+
[2026-01-09 09:28:17 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.953839s, mse: 0.00804329
|
| 75 |
+
[2026-01-09 09:28:21 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.935800s, mse: 0.00799941
|
| 76 |
+
[2026-01-09 09:28:25 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.936768s, mse: 0.00795571
|
| 77 |
+
[2026-01-09 09:28:29 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.936058s, mse: 0.00794016
|
| 78 |
+
[2026-01-09 09:28:29 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 79 |
+
[2026-01-09 09:28:30 root] (train_utils.py 108): INFO ========= Layer 1 =========
|
| 80 |
+
[2026-01-09 09:28:37 root] (train_utils.py 185): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 4.692473s, mse: 0.00892038
|
| 81 |
+
[2026-01-09 09:28:41 root] (train_utils.py 185): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 3.960003s, mse: 0.00479663
|
| 82 |
+
[2026-01-09 09:28:45 root] (train_utils.py 185): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.980257s, mse: 0.00384854
|
| 83 |
+
[2026-01-09 09:28:49 root] (train_utils.py 185): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 3.956233s, mse: 0.00355465
|
| 84 |
+
[2026-01-09 09:28:53 root] (train_utils.py 185): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 3.955152s, mse: 0.00343135
|
| 85 |
+
[2026-01-09 09:28:57 root] (train_utils.py 185): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.946210s, mse: 0.00337971
|
| 86 |
+
[2026-01-09 09:29:01 root] (train_utils.py 185): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.939211s, mse: 0.00336636
|
| 87 |
+
[2026-01-09 09:29:05 root] (train_utils.py 185): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 4.178168s, mse: 0.00329515
|
| 88 |
+
[2026-01-09 09:29:09 root] (train_utils.py 185): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 4.400898s, mse: 0.00326379
|
| 89 |
+
[2026-01-09 09:29:14 root] (train_utils.py 185): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 4.451300s, mse: 0.00321724
|
| 90 |
+
[2026-01-09 09:29:18 root] (train_utils.py 185): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 4.386121s, mse: 0.00316591
|
| 91 |
+
[2026-01-09 09:29:22 root] (train_utils.py 185): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 4.420070s, mse: 0.00313276
|
| 92 |
+
[2026-01-09 09:29:27 root] (train_utils.py 185): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 4.264460s, mse: 0.00310469
|
| 93 |
+
[2026-01-09 09:29:31 root] (train_utils.py 185): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.953069s, mse: 0.00308243
|
| 94 |
+
[2026-01-09 09:29:38 root] (train_utils.py 185): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 7.505043s, mse: 0.00306749
|
| 95 |
+
[2026-01-09 09:29:39 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 96 |
+
[2026-01-09 09:29:39 root] (train_utils.py 108): INFO ========= Layer 2 =========
|
| 97 |
+
[2026-01-09 09:29:52 root] (train_utils.py 185): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 8.480551s, mse: 0.01750460
|
| 98 |
+
[2026-01-09 09:29:59 root] (train_utils.py 185): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 7.587432s, mse: 0.00626545
|
| 99 |
+
[2026-01-09 09:30:06 root] (train_utils.py 185): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 6.716398s, mse: 0.00494380
|
| 100 |
+
[2026-01-09 09:30:10 root] (train_utils.py 185): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 4.318102s, mse: 0.00453308
|
| 101 |
+
[2026-01-09 09:30:17 root] (train_utils.py 185): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 7.168204s, mse: 0.00439964
|
| 102 |
+
[2026-01-09 09:30:25 root] (train_utils.py 185): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 7.189687s, mse: 0.00429795
|
| 103 |
+
[2026-01-09 09:30:32 root] (train_utils.py 185): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 7.178458s, mse: 0.00425246
|
| 104 |
+
[2026-01-09 09:30:39 root] (train_utils.py 185): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 7.220179s, mse: 0.00420888
|
| 105 |
+
[2026-01-09 09:30:44 root] (train_utils.py 185): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 5.011026s, mse: 0.00415287
|
| 106 |
+
[2026-01-09 09:30:48 root] (train_utils.py 185): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.906071s, mse: 0.00411024
|
| 107 |
+
[2026-01-09 09:30:55 root] (train_utils.py 185): INFO layer 2 lwc lac iter 10, lr 0.00083135 time 7.158067s, mse: 0.00407672
|
| 108 |
+
[2026-01-09 09:31:02 root] (train_utils.py 185): INFO layer 2 lwc lac iter 11, lr 0.00048198 time 7.198819s, mse: 0.00404750
|
| 109 |
+
[2026-01-09 09:31:09 root] (train_utils.py 185): INFO layer 2 lwc lac iter 12, lr 0.00022092 time 7.203617s, mse: 0.00401742
|
| 110 |
+
[2026-01-09 09:31:17 root] (train_utils.py 185): INFO layer 2 lwc lac iter 13, lr 0.00005958 time 7.287185s, mse: 0.00398090
|
| 111 |
+
[2026-01-09 09:31:22 root] (train_utils.py 185): INFO layer 2 lwc lac iter 14, lr 0.00000500 time 5.615172s, mse: 0.00397130
|
| 112 |
+
[2026-01-09 09:31:23 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 113 |
+
[2026-01-09 09:31:23 root] (train_utils.py 108): INFO ========= Layer 3 =========
|
| 114 |
+
[2026-01-09 09:31:32 root] (train_utils.py 185): INFO layer 3 lwc lac iter 0, lr 0.00494542 time 6.023239s, mse: 0.02308414
|
| 115 |
+
[2026-01-09 09:31:36 root] (train_utils.py 185): INFO layer 3 lwc lac iter 1, lr 0.00478408 time 3.922480s, mse: 0.01333557
|
| 116 |
+
[2026-01-09 09:31:41 root] (train_utils.py 185): INFO layer 3 lwc lac iter 2, lr 0.00452302 time 4.711427s, mse: 0.01099337
|
| 117 |
+
[2026-01-09 09:31:48 root] (train_utils.py 185): INFO layer 3 lwc lac iter 3, lr 0.00417365 time 7.597089s, mse: 0.01028412
|
| 118 |
+
[2026-01-09 09:31:56 root] (train_utils.py 185): INFO layer 3 lwc lac iter 4, lr 0.00375125 time 7.595494s, mse: 0.01000082
|
| 119 |
+
[2026-01-09 09:32:04 root] (train_utils.py 185): INFO layer 3 lwc lac iter 5, lr 0.00327427 time 7.568671s, mse: 0.00980410
|
| 120 |
+
[2026-01-09 09:32:11 root] (train_utils.py 185): INFO layer 3 lwc lac iter 6, lr 0.00276356 time 7.582738s, mse: 0.00969286
|
| 121 |
+
[2026-01-09 09:32:17 root] (train_utils.py 185): INFO layer 3 lwc lac iter 7, lr 0.00224144 time 6.011546s, mse: 0.00956387
|
| 122 |
+
[2026-01-09 09:32:22 root] (train_utils.py 185): INFO layer 3 lwc lac iter 8, lr 0.00173073 time 4.549473s, mse: 0.00946260
|
| 123 |
+
[2026-01-09 09:32:26 root] (train_utils.py 185): INFO layer 3 lwc lac iter 9, lr 0.00125375 time 4.490500s, mse: 0.00937346
|
| 124 |
+
[2026-01-09 09:32:31 root] (train_utils.py 185): INFO layer 3 lwc lac iter 10, lr 0.00083135 time 4.534710s, mse: 0.00926330
|
| 125 |
+
[2026-01-09 09:32:35 root] (train_utils.py 185): INFO layer 3 lwc lac iter 11, lr 0.00048198 time 4.458297s, mse: 0.00916464
|
| 126 |
+
[2026-01-09 09:32:39 root] (train_utils.py 185): INFO layer 3 lwc lac iter 12, lr 0.00022092 time 3.922243s, mse: 0.00907166
|
| 127 |
+
[2026-01-09 09:32:43 root] (train_utils.py 185): INFO layer 3 lwc lac iter 13, lr 0.00005958 time 3.890109s, mse: 0.00904066
|
| 128 |
+
[2026-01-09 09:32:47 root] (train_utils.py 185): INFO layer 3 lwc lac iter 14, lr 0.00000500 time 3.885668s, mse: 0.00900416
|
| 129 |
+
[2026-01-09 09:32:47 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 130 |
+
[2026-01-09 09:32:48 root] (train_utils.py 108): INFO ========= Layer 4 =========
|
| 131 |
+
[2026-01-09 09:32:59 root] (train_utils.py 185): INFO layer 4 lwc lac iter 0, lr 0.00494542 time 8.274354s, mse: 0.06576648
|
| 132 |
+
[2026-01-09 09:33:07 root] (train_utils.py 185): INFO layer 4 lwc lac iter 1, lr 0.00478408 time 8.427314s, mse: 0.03741666
|
| 133 |
+
[2026-01-09 09:33:15 root] (train_utils.py 185): INFO layer 4 lwc lac iter 2, lr 0.00452302 time 8.378025s, mse: 0.03053248
|
| 134 |
+
[2026-01-09 09:33:24 root] (train_utils.py 185): INFO layer 4 lwc lac iter 3, lr 0.00417365 time 8.409590s, mse: 0.02855516
|
| 135 |
+
[2026-01-09 09:33:32 root] (train_utils.py 185): INFO layer 4 lwc lac iter 4, lr 0.00375125 time 8.404675s, mse: 0.02790034
|
| 136 |
+
[2026-01-09 09:33:39 root] (train_utils.py 185): INFO layer 4 lwc lac iter 5, lr 0.00327427 time 6.628190s, mse: 0.02746365
|
| 137 |
+
[2026-01-09 09:33:43 root] (train_utils.py 185): INFO layer 4 lwc lac iter 6, lr 0.00276356 time 3.947360s, mse: 0.02716962
|
| 138 |
+
[2026-01-09 09:33:47 root] (train_utils.py 185): INFO layer 4 lwc lac iter 7, lr 0.00224144 time 3.893767s, mse: 0.02687641
|
| 139 |
+
[2026-01-09 09:33:51 root] (train_utils.py 185): INFO layer 4 lwc lac iter 8, lr 0.00173073 time 3.886842s, mse: 0.02662238
|
| 140 |
+
[2026-01-09 09:33:56 root] (train_utils.py 185): INFO layer 4 lwc lac iter 9, lr 0.00125375 time 5.446397s, mse: 0.02643147
|
| 141 |
+
[2026-01-09 09:34:04 root] (train_utils.py 185): INFO layer 4 lwc lac iter 10, lr 0.00083135 time 8.440320s, mse: 0.02624781
|
| 142 |
+
[2026-01-09 09:34:13 root] (train_utils.py 185): INFO layer 4 lwc lac iter 11, lr 0.00048198 time 8.405491s, mse: 0.02604026
|
| 143 |
+
[2026-01-09 09:34:21 root] (train_utils.py 185): INFO layer 4 lwc lac iter 12, lr 0.00022092 time 8.418950s, mse: 0.02585863
|
| 144 |
+
[2026-01-09 09:34:30 root] (train_utils.py 185): INFO layer 4 lwc lac iter 13, lr 0.00005958 time 8.423032s, mse: 0.02578292
|
| 145 |
+
[2026-01-09 09:34:38 root] (train_utils.py 185): INFO layer 4 lwc lac iter 14, lr 0.00000500 time 8.420124s, mse: 0.02572995
|
| 146 |
+
[2026-01-09 09:34:39 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 147 |
+
[2026-01-09 09:34:39 root] (train_utils.py 108): INFO ========= Layer 5 =========
|
| 148 |
+
[2026-01-09 09:34:47 root] (train_utils.py 185): INFO layer 5 lwc lac iter 0, lr 0.00494542 time 5.174283s, mse: 0.13743916
|
| 149 |
+
[2026-01-09 09:34:51 root] (train_utils.py 185): INFO layer 5 lwc lac iter 1, lr 0.00478408 time 4.443467s, mse: 0.08057592
|
| 150 |
+
[2026-01-09 09:34:56 root] (train_utils.py 185): INFO layer 5 lwc lac iter 2, lr 0.00452302 time 4.403272s, mse: 0.06617787
|
| 151 |
+
[2026-01-09 09:35:00 root] (train_utils.py 185): INFO layer 5 lwc lac iter 3, lr 0.00417365 time 4.398644s, mse: 0.06287611
|
| 152 |
+
[2026-01-09 09:35:04 root] (train_utils.py 185): INFO layer 5 lwc lac iter 4, lr 0.00375125 time 4.206020s, mse: 0.06213523
|
| 153 |
+
[2026-01-09 09:35:08 root] (train_utils.py 185): INFO layer 5 lwc lac iter 5, lr 0.00327427 time 3.923909s, mse: 0.06160403
|
| 154 |
+
[2026-01-09 09:35:12 root] (train_utils.py 185): INFO layer 5 lwc lac iter 6, lr 0.00276356 time 3.888113s, mse: 0.06119698
|
| 155 |
+
[2026-01-09 09:35:16 root] (train_utils.py 185): INFO layer 5 lwc lac iter 7, lr 0.00224144 time 3.887599s, mse: 0.06094177
|
| 156 |
+
[2026-01-09 09:35:20 root] (train_utils.py 185): INFO layer 5 lwc lac iter 8, lr 0.00173073 time 3.892878s, mse: 0.06060794
|
| 157 |
+
[2026-01-09 09:35:24 root] (train_utils.py 185): INFO layer 5 lwc lac iter 9, lr 0.00125375 time 3.896571s, mse: 0.06020888
|
| 158 |
+
[2026-01-09 09:35:28 root] (train_utils.py 185): INFO layer 5 lwc lac iter 10, lr 0.00083135 time 3.892194s, mse: 0.05995716
|
| 159 |
+
[2026-01-09 09:35:31 root] (train_utils.py 185): INFO layer 5 lwc lac iter 11, lr 0.00048198 time 3.886368s, mse: 0.05978661
|
| 160 |
+
[2026-01-09 09:35:35 root] (train_utils.py 185): INFO layer 5 lwc lac iter 12, lr 0.00022092 time 3.901675s, mse: 0.05955682
|
| 161 |
+
[2026-01-09 09:35:39 root] (train_utils.py 185): INFO layer 5 lwc lac iter 13, lr 0.00005958 time 3.896321s, mse: 0.05938030
|
| 162 |
+
[2026-01-09 09:35:43 root] (train_utils.py 185): INFO layer 5 lwc lac iter 14, lr 0.00000500 time 3.965801s, mse: 0.05934311
|
| 163 |
+
[2026-01-09 09:35:44 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 164 |
+
[2026-01-09 09:35:44 root] (train_utils.py 108): INFO ========= Layer 6 =========
|
| 165 |
+
[2026-01-09 09:35:51 root] (train_utils.py 185): INFO layer 6 lwc lac iter 0, lr 0.00494542 time 4.703084s, mse: 1.86451793
|
| 166 |
+
[2026-01-09 09:35:55 root] (train_utils.py 185): INFO layer 6 lwc lac iter 1, lr 0.00478408 time 3.884077s, mse: 0.35658583
|
| 167 |
+
[2026-01-09 09:35:59 root] (train_utils.py 185): INFO layer 6 lwc lac iter 2, lr 0.00452302 time 3.885417s, mse: 0.32737118
|
| 168 |
+
[2026-01-09 09:36:03 root] (train_utils.py 185): INFO layer 6 lwc lac iter 3, lr 0.00417365 time 3.887404s, mse: 0.28929594
|
| 169 |
+
[2026-01-09 09:36:07 root] (train_utils.py 185): INFO layer 6 lwc lac iter 4, lr 0.00375125 time 3.931743s, mse: 0.24128482
|
| 170 |
+
[2026-01-09 09:36:11 root] (train_utils.py 185): INFO layer 6 lwc lac iter 5, lr 0.00327427 time 3.886941s, mse: 0.21027605
|
| 171 |
+
[2026-01-09 09:36:15 root] (train_utils.py 185): INFO layer 6 lwc lac iter 6, lr 0.00276356 time 3.900860s, mse: 0.25483868
|
| 172 |
+
[2026-01-09 09:36:19 root] (train_utils.py 185): INFO layer 6 lwc lac iter 7, lr 0.00224144 time 3.881273s, mse: 0.23871142
|
| 173 |
+
[2026-01-09 09:36:22 root] (train_utils.py 185): INFO layer 6 lwc lac iter 8, lr 0.00173073 time 3.888987s, mse: 0.21885920
|
| 174 |
+
[2026-01-09 09:36:26 root] (train_utils.py 185): INFO layer 6 lwc lac iter 9, lr 0.00125375 time 3.890998s, mse: 0.20672695
|
| 175 |
+
[2026-01-09 09:36:30 root] (train_utils.py 185): INFO layer 6 lwc lac iter 10, lr 0.00083135 time 3.887677s, mse: 0.20202750
|
| 176 |
+
[2026-01-09 09:36:34 root] (train_utils.py 185): INFO layer 6 lwc lac iter 11, lr 0.00048198 time 3.885399s, mse: 0.17932597
|
| 177 |
+
[2026-01-09 09:36:38 root] (train_utils.py 185): INFO layer 6 lwc lac iter 12, lr 0.00022092 time 3.888746s, mse: 0.20257902
|
| 178 |
+
[2026-01-09 09:36:42 root] (train_utils.py 185): INFO layer 6 lwc lac iter 13, lr 0.00005958 time 3.932971s, mse: 0.20667967
|
| 179 |
+
[2026-01-09 09:36:46 root] (train_utils.py 185): INFO layer 6 lwc lac iter 14, lr 0.00000500 time 3.969469s, mse: 0.16777667
|
| 180 |
+
[2026-01-09 09:36:46 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 181 |
+
[2026-01-09 09:36:47 root] (train_utils.py 108): INFO ========= Layer 7 =========
|
| 182 |
+
[2026-01-09 09:36:54 root] (train_utils.py 185): INFO layer 7 lwc lac iter 0, lr 0.00494542 time 4.668406s, mse: 0.23462753
|
| 183 |
+
[2026-01-09 09:36:58 root] (train_utils.py 185): INFO layer 7 lwc lac iter 1, lr 0.00478408 time 3.981027s, mse: 0.14976017
|
| 184 |
+
[2026-01-09 09:37:02 root] (train_utils.py 185): INFO layer 7 lwc lac iter 2, lr 0.00452302 time 3.976696s, mse: 0.12312289
|
| 185 |
+
[2026-01-09 09:37:06 root] (train_utils.py 185): INFO layer 7 lwc lac iter 3, lr 0.00417365 time 3.965072s, mse: 0.11779824
|
| 186 |
+
[2026-01-09 09:37:10 root] (train_utils.py 185): INFO layer 7 lwc lac iter 4, lr 0.00375125 time 3.976833s, mse: 0.11621600
|
| 187 |
+
[2026-01-09 09:37:14 root] (train_utils.py 185): INFO layer 7 lwc lac iter 5, lr 0.00327427 time 3.983180s, mse: 0.11538153
|
| 188 |
+
[2026-01-09 09:37:18 root] (train_utils.py 185): INFO layer 7 lwc lac iter 6, lr 0.00276356 time 3.975625s, mse: 0.11461711
|
| 189 |
+
[2026-01-09 09:37:22 root] (train_utils.py 185): INFO layer 7 lwc lac iter 7, lr 0.00224144 time 3.975661s, mse: 0.11396322
|
| 190 |
+
[2026-01-09 09:37:26 root] (train_utils.py 185): INFO layer 7 lwc lac iter 8, lr 0.00173073 time 3.959342s, mse: 0.11346199
|
| 191 |
+
[2026-01-09 09:37:30 root] (train_utils.py 185): INFO layer 7 lwc lac iter 9, lr 0.00125375 time 3.949084s, mse: 0.11303829
|
| 192 |
+
[2026-01-09 09:37:34 root] (train_utils.py 185): INFO layer 7 lwc lac iter 10, lr 0.00083135 time 4.273962s, mse: 0.11244514
|
| 193 |
+
[2026-01-09 09:37:38 root] (train_utils.py 185): INFO layer 7 lwc lac iter 11, lr 0.00048198 time 4.313030s, mse: 0.11193727
|
| 194 |
+
[2026-01-09 09:37:43 root] (train_utils.py 185): INFO layer 7 lwc lac iter 12, lr 0.00022092 time 4.331037s, mse: 0.11167257
|
| 195 |
+
[2026-01-09 09:37:47 root] (train_utils.py 185): INFO layer 7 lwc lac iter 13, lr 0.00005958 time 4.272573s, mse: 0.11139309
|
| 196 |
+
[2026-01-09 09:37:51 root] (train_utils.py 185): INFO layer 7 lwc lac iter 14, lr 0.00000500 time 4.334657s, mse: 0.11127126
|
| 197 |
+
[2026-01-09 09:37:52 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 198 |
+
[2026-01-09 09:37:52 root] (train_utils.py 108): INFO ========= Layer 8 =========
|
| 199 |
+
[2026-01-09 09:38:00 root] (train_utils.py 185): INFO layer 8 lwc lac iter 0, lr 0.00494542 time 4.868927s, mse: 0.31783378
|
| 200 |
+
[2026-01-09 09:38:04 root] (train_utils.py 185): INFO layer 8 lwc lac iter 1, lr 0.00478408 time 4.520718s, mse: 0.21154313
|
| 201 |
+
[2026-01-09 09:38:12 root] (train_utils.py 185): INFO layer 8 lwc lac iter 2, lr 0.00452302 time 7.594991s, mse: 0.17556834
|
| 202 |
+
[2026-01-09 09:38:19 root] (train_utils.py 185): INFO layer 8 lwc lac iter 3, lr 0.00417365 time 7.589824s, mse: 0.16892871
|
| 203 |
+
[2026-01-09 09:38:27 root] (train_utils.py 185): INFO layer 8 lwc lac iter 4, lr 0.00375125 time 7.583855s, mse: 0.16700211
|
| 204 |
+
[2026-01-09 09:38:35 root] (train_utils.py 185): INFO layer 8 lwc lac iter 5, lr 0.00327427 time 7.586124s, mse: 0.16594610
|
| 205 |
+
[2026-01-09 09:38:40 root] (train_utils.py 185): INFO layer 8 lwc lac iter 6, lr 0.00276356 time 5.884252s, mse: 0.16510613
|
| 206 |
+
[2026-01-09 09:38:45 root] (train_utils.py 185): INFO layer 8 lwc lac iter 7, lr 0.00224144 time 4.319697s, mse: 0.16456470
|
| 207 |
+
[2026-01-09 09:38:52 root] (train_utils.py 185): INFO layer 8 lwc lac iter 8, lr 0.00173073 time 7.242422s, mse: 0.16401851
|
| 208 |
+
[2026-01-09 09:38:59 root] (train_utils.py 185): INFO layer 8 lwc lac iter 9, lr 0.00125375 time 7.228258s, mse: 0.16352586
|
| 209 |
+
[2026-01-09 09:39:06 root] (train_utils.py 185): INFO layer 8 lwc lac iter 10, lr 0.00083135 time 7.222845s, mse: 0.16331530
|
| 210 |
+
[2026-01-09 09:39:14 root] (train_utils.py 185): INFO layer 8 lwc lac iter 11, lr 0.00048198 time 7.226355s, mse: 0.16285881
|
| 211 |
+
[2026-01-09 09:39:19 root] (train_utils.py 185): INFO layer 8 lwc lac iter 12, lr 0.00022092 time 4.796606s, mse: 0.16254890
|
| 212 |
+
[2026-01-09 09:39:23 root] (train_utils.py 185): INFO layer 8 lwc lac iter 13, lr 0.00005958 time 4.818619s, mse: 0.16240378
|
| 213 |
+
[2026-01-09 09:39:31 root] (train_utils.py 185): INFO layer 8 lwc lac iter 14, lr 0.00000500 time 7.213155s, mse: 0.16246043
|
| 214 |
+
[2026-01-09 09:39:31 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 215 |
+
[2026-01-09 09:39:31 root] (train_utils.py 108): INFO ========= Layer 9 =========
|
| 216 |
+
[2026-01-09 09:39:43 root] (train_utils.py 185): INFO layer 9 lwc lac iter 0, lr 0.00494542 time 8.041505s, mse: 0.37875688
|
| 217 |
+
[2026-01-09 09:39:50 root] (train_utils.py 185): INFO layer 9 lwc lac iter 1, lr 0.00478408 time 7.230130s, mse: 0.25363240
|
| 218 |
+
[2026-01-09 09:39:56 root] (train_utils.py 185): INFO layer 9 lwc lac iter 2, lr 0.00452302 time 5.053484s, mse: 0.21064380
|
| 219 |
+
[2026-01-09 09:40:01 root] (train_utils.py 185): INFO layer 9 lwc lac iter 3, lr 0.00417365 time 5.305918s, mse: 0.20179385
|
| 220 |
+
[2026-01-09 09:40:05 root] (train_utils.py 185): INFO layer 9 lwc lac iter 4, lr 0.00375125 time 4.621566s, mse: 0.19936548
|
| 221 |
+
[2026-01-09 09:40:09 root] (train_utils.py 185): INFO layer 9 lwc lac iter 5, lr 0.00327427 time 3.884829s, mse: 0.19817175
|
| 222 |
+
[2026-01-09 09:40:15 root] (train_utils.py 185): INFO layer 9 lwc lac iter 6, lr 0.00276356 time 5.358749s, mse: 0.19703594
|
| 223 |
+
[2026-01-09 09:40:22 root] (train_utils.py 185): INFO layer 9 lwc lac iter 7, lr 0.00224144 time 7.561985s, mse: 0.19626960
|
| 224 |
+
[2026-01-09 09:40:30 root] (train_utils.py 185): INFO layer 9 lwc lac iter 8, lr 0.00173073 time 7.595258s, mse: 0.19534998
|
| 225 |
+
[2026-01-09 09:40:37 root] (train_utils.py 185): INFO layer 9 lwc lac iter 9, lr 0.00125375 time 7.582781s, mse: 0.19473058
|
| 226 |
+
[2026-01-09 09:40:45 root] (train_utils.py 185): INFO layer 9 lwc lac iter 10, lr 0.00083135 time 7.686873s, mse: 0.19404019
|
| 227 |
+
[2026-01-09 09:40:50 root] (train_utils.py 185): INFO layer 9 lwc lac iter 11, lr 0.00048198 time 5.390258s, mse: 0.19356999
|
| 228 |
+
[2026-01-09 09:40:55 root] (train_utils.py 185): INFO layer 9 lwc lac iter 12, lr 0.00022092 time 4.492068s, mse: 0.19326007
|
| 229 |
+
[2026-01-09 09:40:59 root] (train_utils.py 185): INFO layer 9 lwc lac iter 13, lr 0.00005958 time 4.497151s, mse: 0.19282311
|
| 230 |
+
[2026-01-09 09:41:04 root] (train_utils.py 185): INFO layer 9 lwc lac iter 14, lr 0.00000500 time 4.535296s, mse: 0.19267595
|
| 231 |
+
[2026-01-09 09:41:05 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 232 |
+
[2026-01-09 09:41:05 root] (train_utils.py 108): INFO ========= Layer 10 =========
|
| 233 |
+
[2026-01-09 09:41:13 root] (train_utils.py 185): INFO layer 10 lwc lac iter 0, lr 0.00494542 time 4.746660s, mse: 0.44592521
|
| 234 |
+
[2026-01-09 09:41:17 root] (train_utils.py 185): INFO layer 10 lwc lac iter 1, lr 0.00478408 time 3.956183s, mse: 0.28058022
|
| 235 |
+
[2026-01-09 09:41:21 root] (train_utils.py 185): INFO layer 10 lwc lac iter 2, lr 0.00452302 time 3.949066s, mse: 0.22870731
|
| 236 |
+
[2026-01-09 09:41:28 root] (train_utils.py 185): INFO layer 10 lwc lac iter 3, lr 0.00417365 time 7.370824s, mse: 0.21672769
|
| 237 |
+
[2026-01-09 09:41:36 root] (train_utils.py 185): INFO layer 10 lwc lac iter 4, lr 0.00375125 time 8.410934s, mse: 0.21354958
|
| 238 |
+
[2026-01-09 09:41:45 root] (train_utils.py 185): INFO layer 10 lwc lac iter 5, lr 0.00327427 time 8.433516s, mse: 0.21149486
|
| 239 |
+
[2026-01-09 09:41:53 root] (train_utils.py 185): INFO layer 10 lwc lac iter 6, lr 0.00276356 time 8.380215s, mse: 0.21045262
|
| 240 |
+
[2026-01-09 09:42:02 root] (train_utils.py 185): INFO layer 10 lwc lac iter 7, lr 0.00224144 time 8.411274s, mse: 0.20926467
|
| 241 |
+
[2026-01-09 09:42:08 root] (train_utils.py 185): INFO layer 10 lwc lac iter 8, lr 0.00173073 time 6.738219s, mse: 0.20823501
|
| 242 |
+
[2026-01-09 09:42:12 root] (train_utils.py 185): INFO layer 10 lwc lac iter 9, lr 0.00125375 time 3.919702s, mse: 0.20746952
|
| 243 |
+
[2026-01-09 09:42:16 root] (train_utils.py 185): INFO layer 10 lwc lac iter 10, lr 0.00083135 time 3.885594s, mse: 0.20690618
|
| 244 |
+
[2026-01-09 09:42:20 root] (train_utils.py 185): INFO layer 10 lwc lac iter 11, lr 0.00048198 time 3.890161s, mse: 0.20613439
|
| 245 |
+
[2026-01-09 09:42:24 root] (train_utils.py 185): INFO layer 10 lwc lac iter 12, lr 0.00022092 time 3.880738s, mse: 0.20562243
|
| 246 |
+
[2026-01-09 09:42:31 root] (train_utils.py 185): INFO layer 10 lwc lac iter 13, lr 0.00005958 time 7.122962s, mse: 0.20517452
|
| 247 |
+
[2026-01-09 09:42:39 root] (train_utils.py 185): INFO layer 10 lwc lac iter 14, lr 0.00000500 time 8.400930s, mse: 0.20504668
|
| 248 |
+
[2026-01-09 09:42:40 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 249 |
+
[2026-01-09 09:42:41 root] (train_utils.py 108): INFO ========= Layer 11 =========
|
| 250 |
+
[2026-01-09 09:42:54 root] (train_utils.py 185): INFO layer 11 lwc lac iter 0, lr 0.00494542 time 9.293272s, mse: 0.39262417
|
| 251 |
+
[2026-01-09 09:43:02 root] (train_utils.py 185): INFO layer 11 lwc lac iter 1, lr 0.00478408 time 8.410596s, mse: 0.27127978
|
| 252 |
+
[2026-01-09 09:43:10 root] (train_utils.py 185): INFO layer 11 lwc lac iter 2, lr 0.00452302 time 7.794915s, mse: 0.22630122
|
| 253 |
+
[2026-01-09 09:43:15 root] (train_utils.py 185): INFO layer 11 lwc lac iter 3, lr 0.00417365 time 4.369798s, mse: 0.21789221
|
| 254 |
+
[2026-01-09 09:43:19 root] (train_utils.py 185): INFO layer 11 lwc lac iter 4, lr 0.00375125 time 4.233581s, mse: 0.21573043
|
| 255 |
+
[2026-01-09 09:43:23 root] (train_utils.py 185): INFO layer 11 lwc lac iter 5, lr 0.00327427 time 4.393658s, mse: 0.21401882
|
| 256 |
+
[2026-01-09 09:43:28 root] (train_utils.py 185): INFO layer 11 lwc lac iter 6, lr 0.00276356 time 4.394202s, mse: 0.21313243
|
| 257 |
+
[2026-01-09 09:43:32 root] (train_utils.py 185): INFO layer 11 lwc lac iter 7, lr 0.00224144 time 4.393358s, mse: 0.21215978
|
| 258 |
+
[2026-01-09 09:43:36 root] (train_utils.py 185): INFO layer 11 lwc lac iter 8, lr 0.00173073 time 4.195683s, mse: 0.21121168
|
| 259 |
+
[2026-01-09 09:43:40 root] (train_utils.py 185): INFO layer 11 lwc lac iter 9, lr 0.00125375 time 3.884118s, mse: 0.21032479
|
| 260 |
+
[2026-01-09 09:43:44 root] (train_utils.py 185): INFO layer 11 lwc lac iter 10, lr 0.00083135 time 3.887403s, mse: 0.20987187
|
| 261 |
+
[2026-01-09 09:43:48 root] (train_utils.py 185): INFO layer 11 lwc lac iter 11, lr 0.00048198 time 3.881930s, mse: 0.20908046
|
| 262 |
+
[2026-01-09 09:43:55 root] (train_utils.py 185): INFO layer 11 lwc lac iter 12, lr 0.00022092 time 7.261625s, mse: 0.20848191
|
| 263 |
+
[2026-01-09 09:44:05 root] (train_utils.py 185): INFO layer 11 lwc lac iter 13, lr 0.00005958 time 9.507146s, mse: 0.20800886
|
| 264 |
+
[2026-01-09 09:44:14 root] (train_utils.py 185): INFO layer 11 lwc lac iter 14, lr 0.00000500 time 9.487793s, mse: 0.20795538
|
| 265 |
+
[2026-01-09 09:44:15 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 266 |
+
[2026-01-09 09:44:15 root] (train_utils.py 108): INFO ========= Layer 12 =========
|
| 267 |
+
[2026-01-09 09:44:30 root] (train_utils.py 185): INFO layer 12 lwc lac iter 0, lr 0.00494542 time 10.434841s, mse: 0.43535280
|
| 268 |
+
[2026-01-09 09:44:40 root] (train_utils.py 185): INFO layer 12 lwc lac iter 1, lr 0.00478408 time 9.477639s, mse: 0.29579335
|
| 269 |
+
[2026-01-09 09:44:49 root] (train_utils.py 185): INFO layer 12 lwc lac iter 2, lr 0.00452302 time 9.506626s, mse: 0.24488190
|
| 270 |
+
[2026-01-09 09:44:59 root] (train_utils.py 185): INFO layer 12 lwc lac iter 3, lr 0.00417365 time 9.498228s, mse: 0.23438135
|
| 271 |
+
[2026-01-09 09:45:05 root] (train_utils.py 185): INFO layer 12 lwc lac iter 4, lr 0.00375125 time 6.546145s, mse: 0.23133603
|
| 272 |
+
[2026-01-09 09:45:10 root] (train_utils.py 185): INFO layer 12 lwc lac iter 5, lr 0.00327427 time 4.671869s, mse: 0.22933656
|
| 273 |
+
[2026-01-09 09:45:15 root] (train_utils.py 185): INFO layer 12 lwc lac iter 6, lr 0.00276356 time 5.492260s, mse: 0.22804067
|
| 274 |
+
[2026-01-09 09:45:20 root] (train_utils.py 185): INFO layer 12 lwc lac iter 7, lr 0.00224144 time 4.276597s, mse: 0.22690852
|
| 275 |
+
[2026-01-09 09:45:24 root] (train_utils.py 185): INFO layer 12 lwc lac iter 8, lr 0.00173073 time 3.885321s, mse: 0.22579126
|
| 276 |
+
[2026-01-09 09:45:27 root] (train_utils.py 185): INFO layer 12 lwc lac iter 9, lr 0.00125375 time 3.886717s, mse: 0.22475064
|
| 277 |
+
[2026-01-09 09:45:31 root] (train_utils.py 185): INFO layer 12 lwc lac iter 10, lr 0.00083135 time 3.877200s, mse: 0.22366890
|
| 278 |
+
[2026-01-09 09:45:35 root] (train_utils.py 185): INFO layer 12 lwc lac iter 11, lr 0.00048198 time 3.898989s, mse: 0.22277188
|
| 279 |
+
[2026-01-09 09:45:39 root] (train_utils.py 185): INFO layer 12 lwc lac iter 12, lr 0.00022092 time 3.874586s, mse: 0.22196589
|
| 280 |
+
[2026-01-09 09:45:43 root] (train_utils.py 185): INFO layer 12 lwc lac iter 13, lr 0.00005958 time 3.879584s, mse: 0.22144113
|
| 281 |
+
[2026-01-09 09:45:47 root] (train_utils.py 185): INFO layer 12 lwc lac iter 14, lr 0.00000500 time 3.883109s, mse: 0.22116731
|
| 282 |
+
[2026-01-09 09:45:47 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 283 |
+
[2026-01-09 09:45:48 root] (train_utils.py 108): INFO ========= Layer 13 =========
|
| 284 |
+
[2026-01-09 09:45:55 root] (train_utils.py 185): INFO layer 13 lwc lac iter 0, lr 0.00494542 time 4.864463s, mse: 0.44991863
|
| 285 |
+
[2026-01-09 09:45:59 root] (train_utils.py 185): INFO layer 13 lwc lac iter 1, lr 0.00478408 time 3.966472s, mse: 0.30773303
|
| 286 |
+
[2026-01-09 09:46:03 root] (train_utils.py 185): INFO layer 13 lwc lac iter 2, lr 0.00452302 time 3.968499s, mse: 0.25602528
|
| 287 |
+
[2026-01-09 09:46:07 root] (train_utils.py 185): INFO layer 13 lwc lac iter 3, lr 0.00417365 time 3.985150s, mse: 0.24593170
|
| 288 |
+
[2026-01-09 09:46:11 root] (train_utils.py 185): INFO layer 13 lwc lac iter 4, lr 0.00375125 time 3.969827s, mse: 0.24332635
|
| 289 |
+
[2026-01-09 09:46:15 root] (train_utils.py 185): INFO layer 13 lwc lac iter 5, lr 0.00327427 time 3.975120s, mse: 0.24169515
|
| 290 |
+
[2026-01-09 09:46:19 root] (train_utils.py 185): INFO layer 13 lwc lac iter 6, lr 0.00276356 time 3.966719s, mse: 0.24032030
|
| 291 |
+
[2026-01-09 09:46:23 root] (train_utils.py 185): INFO layer 13 lwc lac iter 7, lr 0.00224144 time 3.981713s, mse: 0.23895445
|
| 292 |
+
[2026-01-09 09:46:27 root] (train_utils.py 185): INFO layer 13 lwc lac iter 8, lr 0.00173073 time 3.957695s, mse: 0.23795472
|
| 293 |
+
[2026-01-09 09:46:31 root] (train_utils.py 185): INFO layer 13 lwc lac iter 9, lr 0.00125375 time 3.928493s, mse: 0.23691620
|
| 294 |
+
[2026-01-09 09:46:35 root] (train_utils.py 185): INFO layer 13 lwc lac iter 10, lr 0.00083135 time 3.950974s, mse: 0.23617835
|
| 295 |
+
[2026-01-09 09:46:39 root] (train_utils.py 185): INFO layer 13 lwc lac iter 11, lr 0.00048198 time 4.044775s, mse: 0.23538260
|
| 296 |
+
[2026-01-09 09:46:43 root] (train_utils.py 185): INFO layer 13 lwc lac iter 12, lr 0.00022092 time 4.395989s, mse: 0.23459788
|
| 297 |
+
[2026-01-09 09:46:48 root] (train_utils.py 185): INFO layer 13 lwc lac iter 13, lr 0.00005958 time 4.461857s, mse: 0.23386008
|
| 298 |
+
[2026-01-09 09:46:52 root] (train_utils.py 185): INFO layer 13 lwc lac iter 14, lr 0.00000500 time 4.388942s, mse: 0.23347831
|
| 299 |
+
[2026-01-09 09:46:52 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 300 |
+
[2026-01-09 09:46:53 root] (train_utils.py 108): INFO ========= Layer 14 =========
|
| 301 |
+
[2026-01-09 09:47:01 root] (train_utils.py 185): INFO layer 14 lwc lac iter 0, lr 0.00494542 time 5.077703s, mse: 0.48670265
|
| 302 |
+
[2026-01-09 09:47:05 root] (train_utils.py 185): INFO layer 14 lwc lac iter 1, lr 0.00478408 time 3.912335s, mse: 0.32924685
|
| 303 |
+
[2026-01-09 09:47:10 root] (train_utils.py 185): INFO layer 14 lwc lac iter 2, lr 0.00452302 time 5.693116s, mse: 0.27174610
|
| 304 |
+
[2026-01-09 09:47:18 root] (train_utils.py 185): INFO layer 14 lwc lac iter 3, lr 0.00417365 time 7.564554s, mse: 0.26111004
|
| 305 |
+
[2026-01-09 09:47:25 root] (train_utils.py 185): INFO layer 14 lwc lac iter 4, lr 0.00375125 time 7.573057s, mse: 0.25857583
|
| 306 |
+
[2026-01-09 09:47:33 root] (train_utils.py 185): INFO layer 14 lwc lac iter 5, lr 0.00327427 time 7.558129s, mse: 0.25724220
|
| 307 |
+
[2026-01-09 09:47:41 root] (train_utils.py 185): INFO layer 14 lwc lac iter 6, lr 0.00276356 time 7.571431s, mse: 0.25530052
|
| 308 |
+
[2026-01-09 09:47:45 root] (train_utils.py 185): INFO layer 14 lwc lac iter 7, lr 0.00224144 time 4.817823s, mse: 0.25373703
|
| 309 |
+
[2026-01-09 09:47:49 root] (train_utils.py 185): INFO layer 14 lwc lac iter 8, lr 0.00173073 time 3.882754s, mse: 0.25232333
|
| 310 |
+
[2026-01-09 09:47:55 root] (train_utils.py 185): INFO layer 14 lwc lac iter 9, lr 0.00125375 time 5.495000s, mse: 0.25103748
|
| 311 |
+
[2026-01-09 09:48:02 root] (train_utils.py 185): INFO layer 14 lwc lac iter 10, lr 0.00083135 time 7.215655s, mse: 0.24987648
|
| 312 |
+
[2026-01-09 09:48:09 root] (train_utils.py 185): INFO layer 14 lwc lac iter 11, lr 0.00048198 time 7.181203s, mse: 0.24912813
|
| 313 |
+
[2026-01-09 09:48:16 root] (train_utils.py 185): INFO layer 14 lwc lac iter 12, lr 0.00022092 time 7.226618s, mse: 0.24813016
|
| 314 |
+
[2026-01-09 09:48:23 root] (train_utils.py 185): INFO layer 14 lwc lac iter 13, lr 0.00005958 time 7.023306s, mse: 0.24762598
|
| 315 |
+
[2026-01-09 09:48:27 root] (train_utils.py 185): INFO layer 14 lwc lac iter 14, lr 0.00000500 time 3.881495s, mse: 0.24739194
|
| 316 |
+
[2026-01-09 09:48:28 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 317 |
+
[2026-01-09 09:48:28 root] (train_utils.py 108): INFO ========= Layer 15 =========
|
| 318 |
+
[2026-01-09 09:48:38 root] (train_utils.py 185): INFO layer 15 lwc lac iter 0, lr 0.00494542 time 7.331558s, mse: 0.48941827
|
| 319 |
+
[2026-01-09 09:48:45 root] (train_utils.py 185): INFO layer 15 lwc lac iter 1, lr 0.00478408 time 7.256174s, mse: 0.32720220
|
| 320 |
+
[2026-01-09 09:48:52 root] (train_utils.py 185): INFO layer 15 lwc lac iter 2, lr 0.00452302 time 7.203066s, mse: 0.26854873
|
| 321 |
+
[2026-01-09 09:49:00 root] (train_utils.py 185): INFO layer 15 lwc lac iter 3, lr 0.00417365 time 7.211432s, mse: 0.25705975
|
| 322 |
+
[2026-01-09 09:49:06 root] (train_utils.py 185): INFO layer 15 lwc lac iter 4, lr 0.00375125 time 5.967390s, mse: 0.25422159
|
| 323 |
+
[2026-01-09 09:49:11 root] (train_utils.py 185): INFO layer 15 lwc lac iter 5, lr 0.00327427 time 5.112001s, mse: 0.25197345
|
| 324 |
+
[2026-01-09 09:49:16 root] (train_utils.py 185): INFO layer 15 lwc lac iter 6, lr 0.00276356 time 5.306431s, mse: 0.25026903
|
| 325 |
+
[2026-01-09 09:49:20 root] (train_utils.py 185): INFO layer 15 lwc lac iter 7, lr 0.00224144 time 3.906909s, mse: 0.24867499
|
| 326 |
+
[2026-01-09 09:49:24 root] (train_utils.py 185): INFO layer 15 lwc lac iter 8, lr 0.00173073 time 4.498396s, mse: 0.24771519
|
| 327 |
+
[2026-01-09 09:49:32 root] (train_utils.py 185): INFO layer 15 lwc lac iter 9, lr 0.00125375 time 7.580038s, mse: 0.24665023
|
| 328 |
+
[2026-01-09 09:49:40 root] (train_utils.py 185): INFO layer 15 lwc lac iter 10, lr 0.00083135 time 7.592080s, mse: 0.24558856
|
| 329 |
+
[2026-01-09 09:49:47 root] (train_utils.py 185): INFO layer 15 lwc lac iter 11, lr 0.00048198 time 7.600457s, mse: 0.24435455
|
| 330 |
+
[2026-01-09 09:49:55 root] (train_utils.py 185): INFO layer 15 lwc lac iter 12, lr 0.00022092 time 7.596599s, mse: 0.24346027
|
| 331 |
+
[2026-01-09 09:50:01 root] (train_utils.py 185): INFO layer 15 lwc lac iter 13, lr 0.00005958 time 6.140972s, mse: 0.24292424
|
| 332 |
+
[2026-01-09 09:50:06 root] (train_utils.py 185): INFO layer 15 lwc lac iter 14, lr 0.00000500 time 4.575392s, mse: 0.24260354
|
| 333 |
+
[2026-01-09 09:50:06 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 334 |
+
[2026-01-09 09:50:06 root] (train_utils.py 108): INFO ========= Layer 16 =========
|
| 335 |
+
[2026-01-09 09:50:15 root] (train_utils.py 185): INFO layer 16 lwc lac iter 0, lr 0.00494542 time 5.665595s, mse: 3.09758520
|
| 336 |
+
[2026-01-09 09:50:19 root] (train_utils.py 185): INFO layer 16 lwc lac iter 1, lr 0.00478408 time 3.916453s, mse: 1.53681600
|
| 337 |
+
[2026-01-09 09:50:23 root] (train_utils.py 185): INFO layer 16 lwc lac iter 2, lr 0.00452302 time 3.887970s, mse: 1.37538433
|
| 338 |
+
[2026-01-09 09:50:27 root] (train_utils.py 185): INFO layer 16 lwc lac iter 3, lr 0.00417365 time 3.882150s, mse: 1.14041376
|
| 339 |
+
[2026-01-09 09:50:32 root] (train_utils.py 185): INFO layer 16 lwc lac iter 4, lr 0.00375125 time 5.692225s, mse: 1.13041377
|
| 340 |
+
[2026-01-09 09:50:41 root] (train_utils.py 185): INFO layer 16 lwc lac iter 5, lr 0.00327427 time 8.392419s, mse: 1.17505825
|
| 341 |
+
[2026-01-09 09:50:49 root] (train_utils.py 185): INFO layer 16 lwc lac iter 6, lr 0.00276356 time 8.391150s, mse: 1.00187659
|
| 342 |
+
[2026-01-09 09:50:58 root] (train_utils.py 185): INFO layer 16 lwc lac iter 7, lr 0.00224144 time 8.387745s, mse: 1.15916288
|
| 343 |
+
[2026-01-09 09:51:06 root] (train_utils.py 185): INFO layer 16 lwc lac iter 8, lr 0.00173073 time 8.414940s, mse: 0.93556213
|
| 344 |
+
[2026-01-09 09:51:14 root] (train_utils.py 185): INFO layer 16 lwc lac iter 9, lr 0.00125375 time 8.394004s, mse: 0.89307052
|
| 345 |
+
[2026-01-09 09:51:18 root] (train_utils.py 185): INFO layer 16 lwc lac iter 10, lr 0.00083135 time 3.919834s, mse: 1.08854449
|
| 346 |
+
[2026-01-09 09:51:22 root] (train_utils.py 185): INFO layer 16 lwc lac iter 11, lr 0.00048198 time 3.896248s, mse: 0.78587675
|
| 347 |
+
[2026-01-09 09:51:26 root] (train_utils.py 185): INFO layer 16 lwc lac iter 12, lr 0.00022092 time 3.886925s, mse: 0.77024889
|
| 348 |
+
[2026-01-09 09:51:30 root] (train_utils.py 185): INFO layer 16 lwc lac iter 13, lr 0.00005958 time 3.890127s, mse: 0.74143833
|
| 349 |
+
[2026-01-09 09:51:34 root] (train_utils.py 185): INFO layer 16 lwc lac iter 14, lr 0.00000500 time 3.933678s, mse: 0.62904388
|
| 350 |
+
[2026-01-09 09:51:34 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 351 |
+
[2026-01-09 09:51:35 root] (train_utils.py 108): INFO ========= Layer 17 =========
|
| 352 |
+
[2026-01-09 09:51:49 root] (train_utils.py 185): INFO layer 17 lwc lac iter 0, lr 0.00494542 time 9.861759s, mse: 0.57632238
|
| 353 |
+
[2026-01-09 09:51:58 root] (train_utils.py 185): INFO layer 17 lwc lac iter 1, lr 0.00478408 time 8.438567s, mse: 0.38568184
|
| 354 |
+
[2026-01-09 09:52:06 root] (train_utils.py 185): INFO layer 17 lwc lac iter 2, lr 0.00452302 time 8.434239s, mse: 0.30990756
|
| 355 |
+
[2026-01-09 09:52:14 root] (train_utils.py 185): INFO layer 17 lwc lac iter 3, lr 0.00417365 time 8.419053s, mse: 0.29348093
|
| 356 |
+
[2026-01-09 09:52:20 root] (train_utils.py 185): INFO layer 17 lwc lac iter 4, lr 0.00375125 time 5.675947s, mse: 0.28841209
|
| 357 |
+
[2026-01-09 09:52:25 root] (train_utils.py 185): INFO layer 17 lwc lac iter 5, lr 0.00327427 time 4.414869s, mse: 0.28536177
|
| 358 |
+
[2026-01-09 09:52:29 root] (train_utils.py 185): INFO layer 17 lwc lac iter 6, lr 0.00276356 time 4.475958s, mse: 0.28336507
|
| 359 |
+
[2026-01-09 09:52:33 root] (train_utils.py 185): INFO layer 17 lwc lac iter 7, lr 0.00224144 time 4.307231s, mse: 0.28023016
|
| 360 |
+
[2026-01-09 09:52:38 root] (train_utils.py 185): INFO layer 17 lwc lac iter 8, lr 0.00173073 time 4.318885s, mse: 0.27797151
|
| 361 |
+
[2026-01-09 09:52:42 root] (train_utils.py 185): INFO layer 17 lwc lac iter 9, lr 0.00125375 time 4.301153s, mse: 0.27724716
|
| 362 |
+
[2026-01-09 09:52:46 root] (train_utils.py 185): INFO layer 17 lwc lac iter 10, lr 0.00083135 time 3.976857s, mse: 0.27549568
|
| 363 |
+
[2026-01-09 09:52:50 root] (train_utils.py 185): INFO layer 17 lwc lac iter 11, lr 0.00048198 time 3.880923s, mse: 0.27411795
|
| 364 |
+
[2026-01-09 09:52:54 root] (train_utils.py 185): INFO layer 17 lwc lac iter 12, lr 0.00022092 time 3.900098s, mse: 0.27230272
|
| 365 |
+
[2026-01-09 09:52:58 root] (train_utils.py 185): INFO layer 17 lwc lac iter 13, lr 0.00005958 time 3.882901s, mse: 0.27161792
|
| 366 |
+
[2026-01-09 09:53:05 root] (train_utils.py 185): INFO layer 17 lwc lac iter 14, lr 0.00000500 time 6.952255s, mse: 0.27142629
|
| 367 |
+
[2026-01-09 09:53:05 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 368 |
+
[2026-01-09 09:53:06 root] (train_utils.py 108): INFO ========= Layer 18 =========
|
| 369 |
+
[2026-01-09 09:53:21 root] (train_utils.py 185): INFO layer 18 lwc lac iter 0, lr 0.00494542 time 10.575045s, mse: 0.68219566
|
| 370 |
+
[2026-01-09 09:53:30 root] (train_utils.py 185): INFO layer 18 lwc lac iter 1, lr 0.00478408 time 9.497893s, mse: 0.44933167
|
| 371 |
+
[2026-01-09 09:53:40 root] (train_utils.py 185): INFO layer 18 lwc lac iter 2, lr 0.00452302 time 9.470748s, mse: 0.36149144
|
| 372 |
+
[2026-01-09 09:53:49 root] (train_utils.py 185): INFO layer 18 lwc lac iter 3, lr 0.00417365 time 9.480987s, mse: 0.34437451
|
| 373 |
+
[2026-01-09 09:53:59 root] (train_utils.py 185): INFO layer 18 lwc lac iter 4, lr 0.00375125 time 9.508292s, mse: 0.33928376
|
| 374 |
+
[2026-01-09 09:54:08 root] (train_utils.py 185): INFO layer 18 lwc lac iter 5, lr 0.00327427 time 9.476126s, mse: 0.33628541
|
| 375 |
+
[2026-01-09 09:54:15 root] (train_utils.py 185): INFO layer 18 lwc lac iter 6, lr 0.00276356 time 6.729955s, mse: 0.33380261
|
| 376 |
+
[2026-01-09 09:54:19 root] (train_utils.py 185): INFO layer 18 lwc lac iter 7, lr 0.00224144 time 4.353182s, mse: 0.33132178
|
| 377 |
+
[2026-01-09 09:54:24 root] (train_utils.py 185): INFO layer 18 lwc lac iter 8, lr 0.00173073 time 4.857163s, mse: 0.32943395
|
| 378 |
+
[2026-01-09 09:54:29 root] (train_utils.py 185): INFO layer 18 lwc lac iter 9, lr 0.00125375 time 5.068378s, mse: 0.32786560
|
| 379 |
+
[2026-01-09 09:54:33 root] (train_utils.py 185): INFO layer 18 lwc lac iter 10, lr 0.00083135 time 3.990787s, mse: 0.32583937
|
| 380 |
+
[2026-01-09 09:54:37 root] (train_utils.py 185): INFO layer 18 lwc lac iter 11, lr 0.00048198 time 3.884449s, mse: 0.32450172
|
| 381 |
+
[2026-01-09 09:54:41 root] (train_utils.py 185): INFO layer 18 lwc lac iter 12, lr 0.00022092 time 3.886384s, mse: 0.32264820
|
| 382 |
+
[2026-01-09 09:54:45 root] (train_utils.py 185): INFO layer 18 lwc lac iter 13, lr 0.00005958 time 3.888823s, mse: 0.32187557
|
| 383 |
+
[2026-01-09 09:54:49 root] (train_utils.py 185): INFO layer 18 lwc lac iter 14, lr 0.00000500 time 3.888052s, mse: 0.32105669
|
| 384 |
+
[2026-01-09 09:54:49 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 385 |
+
[2026-01-09 09:54:50 root] (train_utils.py 108): INFO ========= Layer 19 =========
|
| 386 |
+
[2026-01-09 09:54:57 root] (train_utils.py 185): INFO layer 19 lwc lac iter 0, lr 0.00494542 time 4.764529s, mse: 0.88728219
|
| 387 |
+
[2026-01-09 09:55:01 root] (train_utils.py 185): INFO layer 19 lwc lac iter 1, lr 0.00478408 time 3.909924s, mse: 0.57078516
|
| 388 |
+
[2026-01-09 09:55:05 root] (train_utils.py 185): INFO layer 19 lwc lac iter 2, lr 0.00452302 time 4.041203s, mse: 0.45792666
|
| 389 |
+
[2026-01-09 09:55:09 root] (train_utils.py 185): INFO layer 19 lwc lac iter 3, lr 0.00417365 time 3.979156s, mse: 0.43537480
|
| 390 |
+
[2026-01-09 09:55:13 root] (train_utils.py 185): INFO layer 19 lwc lac iter 4, lr 0.00375125 time 3.976704s, mse: 0.42894897
|
| 391 |
+
[2026-01-09 09:55:17 root] (train_utils.py 185): INFO layer 19 lwc lac iter 5, lr 0.00327427 time 3.986418s, mse: 0.42462113
|
| 392 |
+
[2026-01-09 09:55:21 root] (train_utils.py 185): INFO layer 19 lwc lac iter 6, lr 0.00276356 time 3.974231s, mse: 0.42157629
|
| 393 |
+
[2026-01-09 09:55:25 root] (train_utils.py 185): INFO layer 19 lwc lac iter 7, lr 0.00224144 time 3.975169s, mse: 0.41864219
|
| 394 |
+
[2026-01-09 09:55:29 root] (train_utils.py 185): INFO layer 19 lwc lac iter 8, lr 0.00173073 time 3.985224s, mse: 0.41570342
|
| 395 |
+
[2026-01-09 09:55:33 root] (train_utils.py 185): INFO layer 19 lwc lac iter 9, lr 0.00125375 time 3.964523s, mse: 0.41345572
|
| 396 |
+
[2026-01-09 09:55:37 root] (train_utils.py 185): INFO layer 19 lwc lac iter 10, lr 0.00083135 time 3.971512s, mse: 0.41054672
|
| 397 |
+
[2026-01-09 09:55:41 root] (train_utils.py 185): INFO layer 19 lwc lac iter 11, lr 0.00048198 time 3.981757s, mse: 0.40846488
|
| 398 |
+
[2026-01-09 09:55:45 root] (train_utils.py 185): INFO layer 19 lwc lac iter 12, lr 0.00022092 time 3.930552s, mse: 0.40727249
|
| 399 |
+
[2026-01-09 09:55:48 root] (train_utils.py 185): INFO layer 19 lwc lac iter 13, lr 0.00005958 time 3.955016s, mse: 0.40628025
|
| 400 |
+
[2026-01-09 09:55:53 root] (train_utils.py 185): INFO layer 19 lwc lac iter 14, lr 0.00000500 time 4.286919s, mse: 0.40573606
|
| 401 |
+
[2026-01-09 09:55:53 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 402 |
+
[2026-01-09 09:55:54 root] (train_utils.py 108): INFO ========= Layer 20 =========
|
| 403 |
+
[2026-01-09 09:56:02 root] (train_utils.py 185): INFO layer 20 lwc lac iter 0, lr 0.00494542 time 5.274112s, mse: 0.88836050
|
| 404 |
+
[2026-01-09 09:56:06 root] (train_utils.py 185): INFO layer 20 lwc lac iter 1, lr 0.00478408 time 4.440670s, mse: 0.59483135
|
| 405 |
+
[2026-01-09 09:56:11 root] (train_utils.py 185): INFO layer 20 lwc lac iter 2, lr 0.00452302 time 4.416258s, mse: 0.48579982
|
| 406 |
+
[2026-01-09 09:56:14 root] (train_utils.py 185): INFO layer 20 lwc lac iter 3, lr 0.00417365 time 3.951197s, mse: 0.46583182
|
| 407 |
+
[2026-01-09 09:56:19 root] (train_utils.py 185): INFO layer 20 lwc lac iter 4, lr 0.00375125 time 4.961442s, mse: 0.46044937
|
| 408 |
+
[2026-01-09 09:56:27 root] (train_utils.py 185): INFO layer 20 lwc lac iter 5, lr 0.00327427 time 7.579243s, mse: 0.45749170
|
| 409 |
+
[2026-01-09 09:56:35 root] (train_utils.py 185): INFO layer 20 lwc lac iter 6, lr 0.00276356 time 7.570611s, mse: 0.45316568
|
| 410 |
+
[2026-01-09 09:56:42 root] (train_utils.py 185): INFO layer 20 lwc lac iter 7, lr 0.00224144 time 7.582004s, mse: 0.45053339
|
| 411 |
+
[2026-01-09 09:56:50 root] (train_utils.py 185): INFO layer 20 lwc lac iter 8, lr 0.00173073 time 7.588964s, mse: 0.44832462
|
| 412 |
+
[2026-01-09 09:56:55 root] (train_utils.py 185): INFO layer 20 lwc lac iter 9, lr 0.00125375 time 5.473780s, mse: 0.44616416
|
| 413 |
+
[2026-01-09 09:56:59 root] (train_utils.py 185): INFO layer 20 lwc lac iter 10, lr 0.00083135 time 3.885696s, mse: 0.44334349
|
| 414 |
+
[2026-01-09 09:57:04 root] (train_utils.py 185): INFO layer 20 lwc lac iter 11, lr 0.00048198 time 4.542222s, mse: 0.44204527
|
| 415 |
+
[2026-01-09 09:57:11 root] (train_utils.py 185): INFO layer 20 lwc lac iter 12, lr 0.00022092 time 7.207450s, mse: 0.43987796
|
| 416 |
+
[2026-01-09 09:57:18 root] (train_utils.py 185): INFO layer 20 lwc lac iter 13, lr 0.00005958 time 7.203997s, mse: 0.43863490
|
| 417 |
+
[2026-01-09 09:57:25 root] (train_utils.py 185): INFO layer 20 lwc lac iter 14, lr 0.00000500 time 7.229440s, mse: 0.43791217
|
| 418 |
+
[2026-01-09 09:57:26 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 419 |
+
[2026-01-09 09:57:26 root] (train_utils.py 108): INFO ========= Layer 21 =========
|
| 420 |
+
[2026-01-09 09:57:35 root] (train_utils.py 185): INFO layer 21 lwc lac iter 0, lr 0.00494542 time 5.521300s, mse: 1.18043423
|
| 421 |
+
[2026-01-09 09:57:39 root] (train_utils.py 185): INFO layer 21 lwc lac iter 1, lr 0.00478408 time 3.878793s, mse: 0.77954561
|
| 422 |
+
[2026-01-09 09:57:45 root] (train_utils.py 185): INFO layer 21 lwc lac iter 2, lr 0.00452302 time 5.689105s, mse: 0.64111829
|
| 423 |
+
[2026-01-09 09:57:52 root] (train_utils.py 185): INFO layer 21 lwc lac iter 3, lr 0.00417365 time 7.237405s, mse: 0.61397409
|
| 424 |
+
[2026-01-09 09:58:00 root] (train_utils.py 185): INFO layer 21 lwc lac iter 4, lr 0.00375125 time 7.234711s, mse: 0.60631013
|
| 425 |
+
[2026-01-09 09:58:07 root] (train_utils.py 185): INFO layer 21 lwc lac iter 5, lr 0.00327427 time 7.257921s, mse: 0.60047567
|
| 426 |
+
[2026-01-09 09:58:13 root] (train_utils.py 185): INFO layer 21 lwc lac iter 6, lr 0.00276356 time 6.723596s, mse: 0.59512597
|
| 427 |
+
[2026-01-09 09:58:18 root] (train_utils.py 185): INFO layer 21 lwc lac iter 7, lr 0.00224144 time 4.947057s, mse: 0.59215677
|
| 428 |
+
[2026-01-09 09:58:24 root] (train_utils.py 185): INFO layer 21 lwc lac iter 8, lr 0.00173073 time 5.440034s, mse: 0.58796024
|
| 429 |
+
[2026-01-09 09:58:28 root] (train_utils.py 185): INFO layer 21 lwc lac iter 9, lr 0.00125375 time 4.097990s, mse: 0.58513182
|
| 430 |
+
[2026-01-09 09:58:32 root] (train_utils.py 185): INFO layer 21 lwc lac iter 10, lr 0.00083135 time 3.881567s, mse: 0.58225924
|
| 431 |
+
[2026-01-09 09:58:39 root] (train_utils.py 185): INFO layer 21 lwc lac iter 11, lr 0.00048198 time 7.507426s, mse: 0.57988369
|
| 432 |
+
[2026-01-09 09:58:47 root] (train_utils.py 185): INFO layer 21 lwc lac iter 12, lr 0.00022092 time 7.586020s, mse: 0.57718277
|
| 433 |
+
[2026-01-09 09:58:55 root] (train_utils.py 185): INFO layer 21 lwc lac iter 13, lr 0.00005958 time 7.583313s, mse: 0.57546204
|
| 434 |
+
[2026-01-09 09:59:02 root] (train_utils.py 185): INFO layer 21 lwc lac iter 14, lr 0.00000500 time 7.600305s, mse: 0.57469940
|
| 435 |
+
[2026-01-09 09:59:03 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 436 |
+
[2026-01-09 09:59:03 root] (train_utils.py 108): INFO ========= Layer 22 =========
|
| 437 |
+
[2026-01-09 09:59:13 root] (train_utils.py 185): INFO layer 22 lwc lac iter 0, lr 0.00494542 time 5.760921s, mse: 1.88664389
|
| 438 |
+
[2026-01-09 09:59:18 root] (train_utils.py 185): INFO layer 22 lwc lac iter 1, lr 0.00478408 time 4.911954s, mse: 1.18959606
|
| 439 |
+
[2026-01-09 09:59:22 root] (train_utils.py 185): INFO layer 22 lwc lac iter 2, lr 0.00452302 time 4.495867s, mse: 0.95907360
|
| 440 |
+
[2026-01-09 09:59:26 root] (train_utils.py 185): INFO layer 22 lwc lac iter 3, lr 0.00417365 time 3.928732s, mse: 0.91428280
|
| 441 |
+
[2026-01-09 09:59:30 root] (train_utils.py 185): INFO layer 22 lwc lac iter 4, lr 0.00375125 time 3.887553s, mse: 0.90376323
|
| 442 |
+
[2026-01-09 09:59:34 root] (train_utils.py 185): INFO layer 22 lwc lac iter 5, lr 0.00327427 time 3.897269s, mse: 0.89363086
|
| 443 |
+
[2026-01-09 09:59:41 root] (train_utils.py 185): INFO layer 22 lwc lac iter 6, lr 0.00276356 time 7.003159s, mse: 0.88751125
|
| 444 |
+
[2026-01-09 09:59:49 root] (train_utils.py 185): INFO layer 22 lwc lac iter 7, lr 0.00224144 time 8.405993s, mse: 0.87932986
|
| 445 |
+
[2026-01-09 09:59:58 root] (train_utils.py 185): INFO layer 22 lwc lac iter 8, lr 0.00173073 time 8.416431s, mse: 0.87506205
|
| 446 |
+
[2026-01-09 10:00:06 root] (train_utils.py 185): INFO layer 22 lwc lac iter 9, lr 0.00125375 time 8.401184s, mse: 0.86960399
|
| 447 |
+
[2026-01-09 10:00:14 root] (train_utils.py 185): INFO layer 22 lwc lac iter 10, lr 0.00083135 time 8.401504s, mse: 0.86433518
|
| 448 |
+
[2026-01-09 10:00:21 root] (train_utils.py 185): INFO layer 22 lwc lac iter 11, lr 0.00048198 time 7.016203s, mse: 0.85831034
|
| 449 |
+
[2026-01-09 10:00:25 root] (train_utils.py 185): INFO layer 22 lwc lac iter 12, lr 0.00022092 time 3.926229s, mse: 0.85434479
|
| 450 |
+
[2026-01-09 10:00:29 root] (train_utils.py 185): INFO layer 22 lwc lac iter 13, lr 0.00005958 time 3.888714s, mse: 0.85274106
|
| 451 |
+
[2026-01-09 10:00:33 root] (train_utils.py 185): INFO layer 22 lwc lac iter 14, lr 0.00000500 time 3.876585s, mse: 0.85105854
|
| 452 |
+
[2026-01-09 10:00:34 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 453 |
+
[2026-01-09 10:00:34 root] (train_utils.py 108): INFO ========= Layer 23 =========
|
| 454 |
+
[2026-01-09 10:00:46 root] (train_utils.py 185): INFO layer 23 lwc lac iter 0, lr 0.00494542 time 9.105386s, mse: 2.56160784
|
| 455 |
+
[2026-01-09 10:00:54 root] (train_utils.py 185): INFO layer 23 lwc lac iter 1, lr 0.00478408 time 8.433470s, mse: 1.69400561
|
| 456 |
+
[2026-01-09 10:01:02 root] (train_utils.py 185): INFO layer 23 lwc lac iter 2, lr 0.00452302 time 8.508389s, mse: 1.40092814
|
| 457 |
+
[2026-01-09 10:01:11 root] (train_utils.py 185): INFO layer 23 lwc lac iter 3, lr 0.00417365 time 8.423953s, mse: 1.33960748
|
| 458 |
+
[2026-01-09 10:01:19 root] (train_utils.py 185): INFO layer 23 lwc lac iter 4, lr 0.00375125 time 8.422999s, mse: 1.31923652
|
| 459 |
+
[2026-01-09 10:01:25 root] (train_utils.py 185): INFO layer 23 lwc lac iter 5, lr 0.00327427 time 5.676231s, mse: 1.30260742
|
| 460 |
+
[2026-01-09 10:01:29 root] (train_utils.py 185): INFO layer 23 lwc lac iter 6, lr 0.00276356 time 4.424892s, mse: 1.29341400
|
| 461 |
+
[2026-01-09 10:01:34 root] (train_utils.py 185): INFO layer 23 lwc lac iter 7, lr 0.00224144 time 4.521775s, mse: 1.28473794
|
| 462 |
+
[2026-01-09 10:01:38 root] (train_utils.py 185): INFO layer 23 lwc lac iter 8, lr 0.00173073 time 4.509436s, mse: 1.27725101
|
| 463 |
+
[2026-01-09 10:01:43 root] (train_utils.py 185): INFO layer 23 lwc lac iter 9, lr 0.00125375 time 4.399419s, mse: 1.27071691
|
| 464 |
+
[2026-01-09 10:01:47 root] (train_utils.py 185): INFO layer 23 lwc lac iter 10, lr 0.00083135 time 4.038232s, mse: 1.26552820
|
| 465 |
+
[2026-01-09 10:01:51 root] (train_utils.py 185): INFO layer 23 lwc lac iter 11, lr 0.00048198 time 3.967611s, mse: 1.26018000
|
| 466 |
+
[2026-01-09 10:01:55 root] (train_utils.py 185): INFO layer 23 lwc lac iter 12, lr 0.00022092 time 3.895498s, mse: 1.25696874
|
| 467 |
+
[2026-01-09 10:01:59 root] (train_utils.py 185): INFO layer 23 lwc lac iter 13, lr 0.00005958 time 3.883907s, mse: 1.25348544
|
| 468 |
+
[2026-01-09 10:02:06 root] (train_utils.py 185): INFO layer 23 lwc lac iter 14, lr 0.00000500 time 6.941385s, mse: 1.25113153
|
| 469 |
+
[2026-01-09 10:02:06 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 470 |
+
[2026-01-09 10:02:06 root] (train_utils.py 108): INFO ========= Layer 24 =========
|
| 471 |
+
[2026-01-09 10:02:21 root] (train_utils.py 185): INFO layer 24 lwc lac iter 0, lr 0.00494542 time 10.583919s, mse: 3.33080626
|
| 472 |
+
[2026-01-09 10:02:31 root] (train_utils.py 185): INFO layer 24 lwc lac iter 1, lr 0.00478408 time 9.517836s, mse: 2.21739531
|
| 473 |
+
[2026-01-09 10:02:41 root] (train_utils.py 185): INFO layer 24 lwc lac iter 2, lr 0.00452302 time 9.536418s, mse: 1.83558488
|
| 474 |
+
[2026-01-09 10:02:50 root] (train_utils.py 185): INFO layer 24 lwc lac iter 3, lr 0.00417365 time 9.505691s, mse: 1.75192118
|
| 475 |
+
[2026-01-09 10:03:00 root] (train_utils.py 185): INFO layer 24 lwc lac iter 4, lr 0.00375125 time 9.494667s, mse: 1.73021388
|
| 476 |
+
[2026-01-09 10:03:09 root] (train_utils.py 185): INFO layer 24 lwc lac iter 5, lr 0.00327427 time 9.531198s, mse: 1.70965135
|
| 477 |
+
[2026-01-09 10:03:16 root] (train_utils.py 185): INFO layer 24 lwc lac iter 6, lr 0.00276356 time 7.118713s, mse: 1.69753647
|
| 478 |
+
[2026-01-09 10:03:20 root] (train_utils.py 185): INFO layer 24 lwc lac iter 7, lr 0.00224144 time 4.266183s, mse: 1.68364048
|
| 479 |
+
[2026-01-09 10:03:25 root] (train_utils.py 185): INFO layer 24 lwc lac iter 8, lr 0.00173073 time 4.886819s, mse: 1.67123342
|
| 480 |
+
[2026-01-09 10:03:30 root] (train_utils.py 185): INFO layer 24 lwc lac iter 9, lr 0.00125375 time 5.085745s, mse: 1.66224420
|
| 481 |
+
[2026-01-09 10:03:34 root] (train_utils.py 185): INFO layer 24 lwc lac iter 10, lr 0.00083135 time 4.032805s, mse: 1.65476453
|
| 482 |
+
[2026-01-09 10:03:38 root] (train_utils.py 185): INFO layer 24 lwc lac iter 11, lr 0.00048198 time 3.882242s, mse: 1.64498436
|
| 483 |
+
[2026-01-09 10:03:42 root] (train_utils.py 185): INFO layer 24 lwc lac iter 12, lr 0.00022092 time 3.881795s, mse: 1.63647079
|
| 484 |
+
[2026-01-09 10:03:46 root] (train_utils.py 185): INFO layer 24 lwc lac iter 13, lr 0.00005958 time 3.892410s, mse: 1.63291585
|
| 485 |
+
[2026-01-09 10:03:50 root] (train_utils.py 185): INFO layer 24 lwc lac iter 14, lr 0.00000500 time 3.886352s, mse: 1.63007939
|
| 486 |
+
[2026-01-09 10:03:50 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 487 |
+
[2026-01-09 10:03:51 root] (train_utils.py 108): INFO ========= Layer 25 =========
|
| 488 |
+
[2026-01-09 10:03:58 root] (train_utils.py 185): INFO layer 25 lwc lac iter 0, lr 0.00494542 time 4.754002s, mse: 3.67945337
|
| 489 |
+
[2026-01-09 10:04:02 root] (train_utils.py 185): INFO layer 25 lwc lac iter 1, lr 0.00478408 time 3.891057s, mse: 2.39840055
|
| 490 |
+
[2026-01-09 10:04:06 root] (train_utils.py 185): INFO layer 25 lwc lac iter 2, lr 0.00452302 time 3.965408s, mse: 2.00158238
|
| 491 |
+
[2026-01-09 10:04:10 root] (train_utils.py 185): INFO layer 25 lwc lac iter 3, lr 0.00417365 time 3.975547s, mse: 1.92655563
|
| 492 |
+
[2026-01-09 10:04:14 root] (train_utils.py 185): INFO layer 25 lwc lac iter 4, lr 0.00375125 time 3.992167s, mse: 1.90741169
|
| 493 |
+
[2026-01-09 10:04:18 root] (train_utils.py 185): INFO layer 25 lwc lac iter 5, lr 0.00327427 time 3.972306s, mse: 1.89064825
|
| 494 |
+
[2026-01-09 10:04:22 root] (train_utils.py 185): INFO layer 25 lwc lac iter 6, lr 0.00276356 time 3.959593s, mse: 1.88254857
|
| 495 |
+
[2026-01-09 10:04:26 root] (train_utils.py 185): INFO layer 25 lwc lac iter 7, lr 0.00224144 time 3.978096s, mse: 1.87189174
|
| 496 |
+
[2026-01-09 10:04:30 root] (train_utils.py 185): INFO layer 25 lwc lac iter 8, lr 0.00173073 time 4.038383s, mse: 1.86226833
|
| 497 |
+
[2026-01-09 10:04:34 root] (train_utils.py 185): INFO layer 25 lwc lac iter 9, lr 0.00125375 time 3.965123s, mse: 1.85414529
|
| 498 |
+
[2026-01-09 10:04:38 root] (train_utils.py 185): INFO layer 25 lwc lac iter 10, lr 0.00083135 time 3.987170s, mse: 1.84632003
|
| 499 |
+
[2026-01-09 10:04:42 root] (train_utils.py 185): INFO layer 25 lwc lac iter 11, lr 0.00048198 time 3.968241s, mse: 1.83962476
|
| 500 |
+
[2026-01-09 10:04:46 root] (train_utils.py 185): INFO layer 25 lwc lac iter 12, lr 0.00022092 time 3.965485s, mse: 1.83272731
|
| 501 |
+
[2026-01-09 10:04:50 root] (train_utils.py 185): INFO layer 25 lwc lac iter 13, lr 0.00005958 time 3.952049s, mse: 1.83188641
|
| 502 |
+
[2026-01-09 10:04:54 root] (train_utils.py 185): INFO layer 25 lwc lac iter 14, lr 0.00000500 time 4.158629s, mse: 1.82856822
|
| 503 |
+
[2026-01-09 10:04:54 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 504 |
+
[2026-01-09 10:04:55 root] (train_utils.py 108): INFO ========= Layer 26 =========
|
| 505 |
+
[2026-01-09 10:05:03 root] (train_utils.py 185): INFO layer 26 lwc lac iter 0, lr 0.00494542 time 5.414984s, mse: 4.35819054
|
| 506 |
+
[2026-01-09 10:05:08 root] (train_utils.py 185): INFO layer 26 lwc lac iter 1, lr 0.00478408 time 4.391829s, mse: 2.94494462
|
| 507 |
+
[2026-01-09 10:05:12 root] (train_utils.py 185): INFO layer 26 lwc lac iter 2, lr 0.00452302 time 4.448181s, mse: 2.46222878
|
| 508 |
+
[2026-01-09 10:05:16 root] (train_utils.py 185): INFO layer 26 lwc lac iter 3, lr 0.00417365 time 4.035336s, mse: 2.36697221
|
| 509 |
+
[2026-01-09 10:05:20 root] (train_utils.py 185): INFO layer 26 lwc lac iter 4, lr 0.00375125 time 3.881711s, mse: 2.34871936
|
| 510 |
+
[2026-01-09 10:05:27 root] (train_utils.py 185): INFO layer 26 lwc lac iter 5, lr 0.00327427 time 7.229694s, mse: 2.33013940
|
| 511 |
+
[2026-01-09 10:05:35 root] (train_utils.py 185): INFO layer 26 lwc lac iter 6, lr 0.00276356 time 7.588176s, mse: 2.31725478
|
| 512 |
+
[2026-01-09 10:05:42 root] (train_utils.py 185): INFO layer 26 lwc lac iter 7, lr 0.00224144 time 7.586895s, mse: 2.30295658
|
| 513 |
+
[2026-01-09 10:05:50 root] (train_utils.py 185): INFO layer 26 lwc lac iter 8, lr 0.00173073 time 7.570282s, mse: 2.29171467
|
| 514 |
+
[2026-01-09 10:05:57 root] (train_utils.py 185): INFO layer 26 lwc lac iter 9, lr 0.00125375 time 6.883629s, mse: 2.28112888
|
| 515 |
+
[2026-01-09 10:06:01 root] (train_utils.py 185): INFO layer 26 lwc lac iter 10, lr 0.00083135 time 3.904047s, mse: 2.27260423
|
| 516 |
+
[2026-01-09 10:06:05 root] (train_utils.py 185): INFO layer 26 lwc lac iter 11, lr 0.00048198 time 3.889935s, mse: 2.26187754
|
| 517 |
+
[2026-01-09 10:06:12 root] (train_utils.py 185): INFO layer 26 lwc lac iter 12, lr 0.00022092 time 7.026200s, mse: 2.25517917
|
| 518 |
+
[2026-01-09 10:06:19 root] (train_utils.py 185): INFO layer 26 lwc lac iter 13, lr 0.00005958 time 7.208640s, mse: 2.24800634
|
| 519 |
+
[2026-01-09 10:06:26 root] (train_utils.py 185): INFO layer 26 lwc lac iter 14, lr 0.00000500 time 7.211710s, mse: 2.24403787
|
| 520 |
+
[2026-01-09 10:06:27 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 521 |
+
[2026-01-09 10:06:27 root] (train_utils.py 108): INFO ========= Layer 27 =========
|
| 522 |
+
[2026-01-09 10:06:37 root] (train_utils.py 185): INFO layer 27 lwc lac iter 0, lr 0.00494542 time 6.275254s, mse: 5.94560862
|
| 523 |
+
[2026-01-09 10:06:41 root] (train_utils.py 185): INFO layer 27 lwc lac iter 1, lr 0.00478408 time 3.890866s, mse: 3.95834851
|
| 524 |
+
[2026-01-09 10:06:46 root] (train_utils.py 185): INFO layer 27 lwc lac iter 2, lr 0.00452302 time 4.665193s, mse: 3.32281756
|
| 525 |
+
[2026-01-09 10:06:53 root] (train_utils.py 185): INFO layer 27 lwc lac iter 3, lr 0.00417365 time 7.189707s, mse: 3.18086267
|
| 526 |
+
[2026-01-09 10:07:00 root] (train_utils.py 185): INFO layer 27 lwc lac iter 4, lr 0.00375125 time 7.197548s, mse: 3.14467168
|
| 527 |
+
[2026-01-09 10:07:07 root] (train_utils.py 185): INFO layer 27 lwc lac iter 5, lr 0.00327427 time 7.219765s, mse: 3.12000346
|
| 528 |
+
[2026-01-09 10:07:15 root] (train_utils.py 185): INFO layer 27 lwc lac iter 6, lr 0.00276356 time 7.202462s, mse: 3.09776139
|
| 529 |
+
[2026-01-09 10:07:20 root] (train_utils.py 185): INFO layer 27 lwc lac iter 7, lr 0.00224144 time 5.093818s, mse: 3.07834363
|
| 530 |
+
[2026-01-09 10:07:25 root] (train_utils.py 185): INFO layer 27 lwc lac iter 8, lr 0.00173073 time 5.226724s, mse: 3.06277657
|
| 531 |
+
[2026-01-09 10:07:30 root] (train_utils.py 185): INFO layer 27 lwc lac iter 9, lr 0.00125375 time 4.802370s, mse: 3.04591680
|
| 532 |
+
[2026-01-09 10:07:34 root] (train_utils.py 185): INFO layer 27 lwc lac iter 10, lr 0.00083135 time 3.894957s, mse: 3.03134632
|
| 533 |
+
[2026-01-09 10:07:40 root] (train_utils.py 185): INFO layer 27 lwc lac iter 11, lr 0.00048198 time 6.565792s, mse: 3.01916480
|
| 534 |
+
[2026-01-09 10:07:48 root] (train_utils.py 185): INFO layer 27 lwc lac iter 12, lr 0.00022092 time 7.576006s, mse: 3.00719571
|
| 535 |
+
[2026-01-09 10:07:55 root] (train_utils.py 185): INFO layer 27 lwc lac iter 13, lr 0.00005958 time 7.576934s, mse: 2.99984956
|
| 536 |
+
[2026-01-09 10:08:03 root] (train_utils.py 185): INFO layer 27 lwc lac iter 14, lr 0.00000500 time 7.584801s, mse: 2.99120903
|
| 537 |
+
[2026-01-09 10:08:03 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 538 |
+
[2026-01-09 10:08:04 root] (train_utils.py 108): INFO ========= Layer 28 =========
|
| 539 |
+
[2026-01-09 10:08:13 root] (train_utils.py 185): INFO layer 28 lwc lac iter 0, lr 0.00494542 time 5.645861s, mse: 8.40579605
|
| 540 |
+
[2026-01-09 10:08:18 root] (train_utils.py 185): INFO layer 28 lwc lac iter 1, lr 0.00478408 time 4.716225s, mse: 5.55529737
|
| 541 |
+
[2026-01-09 10:08:23 root] (train_utils.py 185): INFO layer 28 lwc lac iter 2, lr 0.00452302 time 4.781032s, mse: 4.64479589
|
| 542 |
+
[2026-01-09 10:08:27 root] (train_utils.py 185): INFO layer 28 lwc lac iter 3, lr 0.00417365 time 4.215279s, mse: 4.46341419
|
| 543 |
+
[2026-01-09 10:08:31 root] (train_utils.py 185): INFO layer 28 lwc lac iter 4, lr 0.00375125 time 3.945424s, mse: 4.40386772
|
| 544 |
+
[2026-01-09 10:08:35 root] (train_utils.py 185): INFO layer 28 lwc lac iter 5, lr 0.00327427 time 3.895538s, mse: 4.37245226
|
| 545 |
+
[2026-01-09 10:08:39 root] (train_utils.py 185): INFO layer 28 lwc lac iter 6, lr 0.00276356 time 4.438238s, mse: 4.34240580
|
| 546 |
+
[2026-01-09 10:08:48 root] (train_utils.py 185): INFO layer 28 lwc lac iter 7, lr 0.00224144 time 8.389002s, mse: 4.31763363
|
| 547 |
+
[2026-01-09 10:08:56 root] (train_utils.py 185): INFO layer 28 lwc lac iter 8, lr 0.00173073 time 8.383581s, mse: 4.29854107
|
| 548 |
+
[2026-01-09 10:09:05 root] (train_utils.py 185): INFO layer 28 lwc lac iter 9, lr 0.00125375 time 8.399694s, mse: 4.28071547
|
| 549 |
+
[2026-01-09 10:09:13 root] (train_utils.py 185): INFO layer 28 lwc lac iter 10, lr 0.00083135 time 8.417864s, mse: 4.26679897
|
| 550 |
+
[2026-01-09 10:09:21 root] (train_utils.py 185): INFO layer 28 lwc lac iter 11, lr 0.00048198 time 8.410913s, mse: 4.24268007
|
| 551 |
+
[2026-01-09 10:09:26 root] (train_utils.py 185): INFO layer 28 lwc lac iter 12, lr 0.00022092 time 5.137883s, mse: 4.22641373
|
| 552 |
+
[2026-01-09 10:09:30 root] (train_utils.py 185): INFO layer 28 lwc lac iter 13, lr 0.00005958 time 3.939935s, mse: 4.22128248
|
| 553 |
+
[2026-01-09 10:09:34 root] (train_utils.py 185): INFO layer 28 lwc lac iter 14, lr 0.00000500 time 3.886578s, mse: 4.21494389
|
| 554 |
+
[2026-01-09 10:09:35 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 555 |
+
[2026-01-09 10:09:35 root] (train_utils.py 108): INFO ========= Layer 29 =========
|
| 556 |
+
[2026-01-09 10:09:46 root] (train_utils.py 185): INFO layer 29 lwc lac iter 0, lr 0.00494542 time 7.813271s, mse: 10.38746834
|
| 557 |
+
[2026-01-09 10:09:54 root] (train_utils.py 185): INFO layer 29 lwc lac iter 1, lr 0.00478408 time 8.429181s, mse: 7.14648628
|
| 558 |
+
[2026-01-09 10:10:02 root] (train_utils.py 185): INFO layer 29 lwc lac iter 2, lr 0.00452302 time 8.383594s, mse: 6.03318691
|
| 559 |
+
[2026-01-09 10:10:11 root] (train_utils.py 185): INFO layer 29 lwc lac iter 3, lr 0.00417365 time 8.417278s, mse: 5.78764057
|
| 560 |
+
[2026-01-09 10:10:19 root] (train_utils.py 185): INFO layer 29 lwc lac iter 4, lr 0.00375125 time 8.413698s, mse: 5.71550655
|
| 561 |
+
[2026-01-09 10:10:26 root] (train_utils.py 185): INFO layer 29 lwc lac iter 5, lr 0.00327427 time 7.294332s, mse: 5.66473246
|
| 562 |
+
[2026-01-09 10:10:31 root] (train_utils.py 185): INFO layer 29 lwc lac iter 6, lr 0.00276356 time 4.335289s, mse: 5.61916113
|
| 563 |
+
[2026-01-09 10:10:35 root] (train_utils.py 185): INFO layer 29 lwc lac iter 7, lr 0.00224144 time 4.477592s, mse: 5.58458805
|
| 564 |
+
[2026-01-09 10:10:40 root] (train_utils.py 185): INFO layer 29 lwc lac iter 8, lr 0.00173073 time 4.570168s, mse: 5.54784393
|
| 565 |
+
[2026-01-09 10:10:44 root] (train_utils.py 185): INFO layer 29 lwc lac iter 9, lr 0.00125375 time 4.581471s, mse: 5.52231646
|
| 566 |
+
[2026-01-09 10:10:49 root] (train_utils.py 185): INFO layer 29 lwc lac iter 10, lr 0.00083135 time 4.188355s, mse: 5.48976994
|
| 567 |
+
[2026-01-09 10:10:52 root] (train_utils.py 185): INFO layer 29 lwc lac iter 11, lr 0.00048198 time 3.879935s, mse: 5.46507311
|
| 568 |
+
[2026-01-09 10:10:56 root] (train_utils.py 185): INFO layer 29 lwc lac iter 12, lr 0.00022092 time 3.884995s, mse: 5.44575977
|
| 569 |
+
[2026-01-09 10:11:03 root] (train_utils.py 185): INFO layer 29 lwc lac iter 13, lr 0.00005958 time 6.919443s, mse: 5.43577242
|
| 570 |
+
[2026-01-09 10:11:13 root] (train_utils.py 185): INFO layer 29 lwc lac iter 14, lr 0.00000500 time 9.446140s, mse: 5.42604542
|
| 571 |
+
[2026-01-09 10:11:13 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 572 |
+
[2026-01-09 10:11:14 root] (train_utils.py 108): INFO ========= Layer 30 =========
|
| 573 |
+
[2026-01-09 10:11:29 root] (train_utils.py 185): INFO layer 30 lwc lac iter 0, lr 0.00494542 time 10.467985s, mse: 16.29405975
|
| 574 |
+
[2026-01-09 10:11:38 root] (train_utils.py 185): INFO layer 30 lwc lac iter 1, lr 0.00478408 time 9.468451s, mse: 11.01632500
|
| 575 |
+
[2026-01-09 10:11:48 root] (train_utils.py 185): INFO layer 30 lwc lac iter 2, lr 0.00452302 time 9.468787s, mse: 9.27882481
|
| 576 |
+
[2026-01-09 10:11:57 root] (train_utils.py 185): INFO layer 30 lwc lac iter 3, lr 0.00417365 time 9.456441s, mse: 8.87542439
|
| 577 |
+
[2026-01-09 10:12:07 root] (train_utils.py 185): INFO layer 30 lwc lac iter 4, lr 0.00375125 time 9.510264s, mse: 8.75351048
|
| 578 |
+
[2026-01-09 10:12:14 root] (train_utils.py 185): INFO layer 30 lwc lac iter 5, lr 0.00327427 time 7.368535s, mse: 8.65880680
|
| 579 |
+
[2026-01-09 10:12:18 root] (train_utils.py 185): INFO layer 30 lwc lac iter 6, lr 0.00276356 time 4.237104s, mse: 8.60634327
|
| 580 |
+
[2026-01-09 10:12:23 root] (train_utils.py 185): INFO layer 30 lwc lac iter 7, lr 0.00224144 time 4.838456s, mse: 8.53597736
|
| 581 |
+
[2026-01-09 10:12:28 root] (train_utils.py 185): INFO layer 30 lwc lac iter 8, lr 0.00173073 time 5.074469s, mse: 8.50352001
|
| 582 |
+
[2026-01-09 10:12:32 root] (train_utils.py 185): INFO layer 30 lwc lac iter 9, lr 0.00125375 time 4.104348s, mse: 8.44190311
|
| 583 |
+
[2026-01-09 10:12:36 root] (train_utils.py 185): INFO layer 30 lwc lac iter 10, lr 0.00083135 time 3.886484s, mse: 8.40491486
|
| 584 |
+
[2026-01-09 10:12:40 root] (train_utils.py 185): INFO layer 30 lwc lac iter 11, lr 0.00048198 time 3.886079s, mse: 8.38511753
|
| 585 |
+
[2026-01-09 10:12:44 root] (train_utils.py 185): INFO layer 30 lwc lac iter 12, lr 0.00022092 time 3.892271s, mse: 8.35692787
|
| 586 |
+
[2026-01-09 10:12:48 root] (train_utils.py 185): INFO layer 30 lwc lac iter 13, lr 0.00005958 time 3.886010s, mse: 8.35674667
|
| 587 |
+
[2026-01-09 10:12:52 root] (train_utils.py 185): INFO layer 30 lwc lac iter 14, lr 0.00000500 time 3.880266s, mse: 8.34408569
|
| 588 |
+
[2026-01-09 10:12:52 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 589 |
+
[2026-01-09 10:12:52 root] (train_utils.py 108): INFO ========= Layer 31 =========
|
| 590 |
+
[2026-01-09 10:13:00 root] (train_utils.py 185): INFO layer 31 lwc lac iter 0, lr 0.00494542 time 4.802090s, mse: 20.78250885
|
| 591 |
+
[2026-01-09 10:13:04 root] (train_utils.py 185): INFO layer 31 lwc lac iter 1, lr 0.00478408 time 4.032989s, mse: 14.37235165
|
| 592 |
+
[2026-01-09 10:13:08 root] (train_utils.py 185): INFO layer 31 lwc lac iter 2, lr 0.00452302 time 3.989193s, mse: 12.13233566
|
| 593 |
+
[2026-01-09 10:13:12 root] (train_utils.py 185): INFO layer 31 lwc lac iter 3, lr 0.00417365 time 3.970271s, mse: 11.62570667
|
| 594 |
+
[2026-01-09 10:13:16 root] (train_utils.py 185): INFO layer 31 lwc lac iter 4, lr 0.00375125 time 3.965022s, mse: 11.51362991
|
| 595 |
+
[2026-01-09 10:13:20 root] (train_utils.py 185): INFO layer 31 lwc lac iter 5, lr 0.00327427 time 3.978235s, mse: 11.42485142
|
| 596 |
+
[2026-01-09 10:13:24 root] (train_utils.py 185): INFO layer 31 lwc lac iter 6, lr 0.00276356 time 3.956572s, mse: 11.33607769
|
| 597 |
+
[2026-01-09 10:13:28 root] (train_utils.py 185): INFO layer 31 lwc lac iter 7, lr 0.00224144 time 3.983628s, mse: 11.27843571
|
| 598 |
+
[2026-01-09 10:13:32 root] (train_utils.py 185): INFO layer 31 lwc lac iter 8, lr 0.00173073 time 3.971437s, mse: 11.22037888
|
| 599 |
+
[2026-01-09 10:13:36 root] (train_utils.py 185): INFO layer 31 lwc lac iter 9, lr 0.00125375 time 3.981398s, mse: 11.15839195
|
| 600 |
+
[2026-01-09 10:13:40 root] (train_utils.py 185): INFO layer 31 lwc lac iter 10, lr 0.00083135 time 3.970976s, mse: 11.12734127
|
| 601 |
+
[2026-01-09 10:13:43 root] (train_utils.py 185): INFO layer 31 lwc lac iter 11, lr 0.00048198 time 3.968406s, mse: 11.08810806
|
| 602 |
+
[2026-01-09 10:13:47 root] (train_utils.py 185): INFO layer 31 lwc lac iter 12, lr 0.00022092 time 3.966583s, mse: 11.05513668
|
| 603 |
+
[2026-01-09 10:13:52 root] (train_utils.py 185): INFO layer 31 lwc lac iter 13, lr 0.00005958 time 4.241404s, mse: 11.03436947
|
| 604 |
+
[2026-01-09 10:13:56 root] (train_utils.py 185): INFO layer 31 lwc lac iter 14, lr 0.00000500 time 4.397378s, mse: 11.01393795
|
| 605 |
+
[2026-01-09 10:13:57 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 606 |
+
[2026-01-09 10:13:57 root] (train_utils.py 108): INFO ========= Layer 32 =========
|
| 607 |
+
[2026-01-09 10:14:05 root] (train_utils.py 185): INFO layer 32 lwc lac iter 0, lr 0.00494542 time 5.240127s, mse: 28.37956429
|
| 608 |
+
[2026-01-09 10:14:10 root] (train_utils.py 185): INFO layer 32 lwc lac iter 1, lr 0.00478408 time 4.407370s, mse: 19.76789856
|
| 609 |
+
[2026-01-09 10:14:14 root] (train_utils.py 185): INFO layer 32 lwc lac iter 2, lr 0.00452302 time 4.113342s, mse: 16.61169624
|
| 610 |
+
[2026-01-09 10:14:18 root] (train_utils.py 185): INFO layer 32 lwc lac iter 3, lr 0.00417365 time 3.878139s, mse: 15.88970184
|
| 611 |
+
[2026-01-09 10:14:24 root] (train_utils.py 185): INFO layer 32 lwc lac iter 4, lr 0.00375125 time 6.959631s, mse: 15.74769402
|
| 612 |
+
[2026-01-09 10:14:32 root] (train_utils.py 185): INFO layer 32 lwc lac iter 5, lr 0.00327427 time 7.576349s, mse: 15.61922455
|
| 613 |
+
[2026-01-09 10:14:40 root] (train_utils.py 185): INFO layer 32 lwc lac iter 6, lr 0.00276356 time 7.601277s, mse: 15.51004982
|
| 614 |
+
[2026-01-09 10:14:47 root] (train_utils.py 185): INFO layer 32 lwc lac iter 7, lr 0.00224144 time 7.561176s, mse: 15.42904854
|
| 615 |
+
[2026-01-09 10:14:54 root] (train_utils.py 185): INFO layer 32 lwc lac iter 8, lr 0.00173073 time 7.163068s, mse: 15.34880447
|
| 616 |
+
[2026-01-09 10:14:58 root] (train_utils.py 185): INFO layer 32 lwc lac iter 9, lr 0.00125375 time 3.901019s, mse: 15.27359772
|
| 617 |
+
[2026-01-09 10:15:02 root] (train_utils.py 185): INFO layer 32 lwc lac iter 10, lr 0.00083135 time 3.876441s, mse: 15.21441174
|
| 618 |
+
[2026-01-09 10:15:09 root] (train_utils.py 185): INFO layer 32 lwc lac iter 11, lr 0.00048198 time 6.410050s, mse: 15.16252708
|
| 619 |
+
[2026-01-09 10:15:16 root] (train_utils.py 185): INFO layer 32 lwc lac iter 12, lr 0.00022092 time 7.196454s, mse: 15.10843849
|
| 620 |
+
[2026-01-09 10:15:23 root] (train_utils.py 185): INFO layer 32 lwc lac iter 13, lr 0.00005958 time 7.212720s, mse: 15.08382893
|
| 621 |
+
[2026-01-09 10:15:30 root] (train_utils.py 185): INFO layer 32 lwc lac iter 14, lr 0.00000500 time 7.231273s, mse: 15.06546974
|
| 622 |
+
[2026-01-09 10:15:31 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 623 |
+
[2026-01-09 10:15:31 root] (train_utils.py 108): INFO ========= Layer 33 =========
|
| 624 |
+
[2026-01-09 10:15:39 root] (train_utils.py 185): INFO layer 33 lwc lac iter 0, lr 0.00494542 time 4.706686s, mse: 41.54327011
|
| 625 |
+
[2026-01-09 10:15:44 root] (train_utils.py 185): INFO layer 33 lwc lac iter 1, lr 0.00478408 time 4.436514s, mse: 27.93664551
|
| 626 |
+
[2026-01-09 10:15:51 root] (train_utils.py 185): INFO layer 33 lwc lac iter 2, lr 0.00452302 time 7.198779s, mse: 23.32941628
|
| 627 |
+
[2026-01-09 10:15:58 root] (train_utils.py 185): INFO layer 33 lwc lac iter 3, lr 0.00417365 time 7.216425s, mse: 22.34293175
|
| 628 |
+
[2026-01-09 10:16:05 root] (train_utils.py 185): INFO layer 33 lwc lac iter 4, lr 0.00375125 time 7.222544s, mse: 22.07669640
|
| 629 |
+
[2026-01-09 10:16:12 root] (train_utils.py 185): INFO layer 33 lwc lac iter 5, lr 0.00327427 time 7.230317s, mse: 21.87960243
|
| 630 |
+
[2026-01-09 10:16:18 root] (train_utils.py 185): INFO layer 33 lwc lac iter 6, lr 0.00276356 time 5.261297s, mse: 21.73635674
|
| 631 |
+
[2026-01-09 10:16:23 root] (train_utils.py 185): INFO layer 33 lwc lac iter 7, lr 0.00224144 time 5.232856s, mse: 21.58724403
|
| 632 |
+
[2026-01-09 10:16:28 root] (train_utils.py 185): INFO layer 33 lwc lac iter 8, lr 0.00173073 time 4.793520s, mse: 21.46766853
|
| 633 |
+
[2026-01-09 10:16:32 root] (train_utils.py 185): INFO layer 33 lwc lac iter 9, lr 0.00125375 time 3.886966s, mse: 21.36098099
|
| 634 |
+
[2026-01-09 10:16:37 root] (train_utils.py 185): INFO layer 33 lwc lac iter 10, lr 0.00083135 time 5.417264s, mse: 21.27636719
|
| 635 |
+
[2026-01-09 10:16:45 root] (train_utils.py 185): INFO layer 33 lwc lac iter 11, lr 0.00048198 time 7.600954s, mse: 21.16030693
|
| 636 |
+
[2026-01-09 10:16:52 root] (train_utils.py 185): INFO layer 33 lwc lac iter 12, lr 0.00022092 time 7.574526s, mse: 21.07536125
|
| 637 |
+
[2026-01-09 10:17:00 root] (train_utils.py 185): INFO layer 33 lwc lac iter 13, lr 0.00005958 time 7.584183s, mse: 20.99114990
|
| 638 |
+
[2026-01-09 10:17:07 root] (train_utils.py 185): INFO layer 33 lwc lac iter 14, lr 0.00000500 time 7.586675s, mse: 20.95961761
|
| 639 |
+
[2026-01-09 10:17:08 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 640 |
+
[2026-01-09 10:17:08 root] (train_utils.py 108): INFO ========= Layer 34 =========
|
| 641 |
+
[2026-01-09 10:17:17 root] (train_utils.py 185): INFO layer 34 lwc lac iter 0, lr 0.00494542 time 5.584197s, mse: 64.93594360
|
| 642 |
+
[2026-01-09 10:17:22 root] (train_utils.py 185): INFO layer 34 lwc lac iter 1, lr 0.00478408 time 4.842231s, mse: 40.86461258
|
| 643 |
+
[2026-01-09 10:17:26 root] (train_utils.py 185): INFO layer 34 lwc lac iter 2, lr 0.00452302 time 4.336210s, mse: 33.65349960
|
| 644 |
+
[2026-01-09 10:17:30 root] (train_utils.py 185): INFO layer 34 lwc lac iter 3, lr 0.00417365 time 3.875152s, mse: 31.96302605
|
| 645 |
+
[2026-01-09 10:17:34 root] (train_utils.py 185): INFO layer 34 lwc lac iter 4, lr 0.00375125 time 3.886546s, mse: 31.66926384
|
| 646 |
+
[2026-01-09 10:17:39 root] (train_utils.py 185): INFO layer 34 lwc lac iter 5, lr 0.00327427 time 5.056141s, mse: 31.07656479
|
| 647 |
+
[2026-01-09 10:17:47 root] (train_utils.py 185): INFO layer 34 lwc lac iter 6, lr 0.00276356 time 8.413598s, mse: 30.91048813
|
| 648 |
+
[2026-01-09 10:17:56 root] (train_utils.py 185): INFO layer 34 lwc lac iter 7, lr 0.00224144 time 8.436667s, mse: 30.05115700
|
| 649 |
+
[2026-01-09 10:18:04 root] (train_utils.py 185): INFO layer 34 lwc lac iter 8, lr 0.00173073 time 8.411937s, mse: 29.89023590
|
| 650 |
+
[2026-01-09 10:18:12 root] (train_utils.py 185): INFO layer 34 lwc lac iter 9, lr 0.00125375 time 8.405565s, mse: 30.35319901
|
| 651 |
+
[2026-01-09 10:18:21 root] (train_utils.py 185): INFO layer 34 lwc lac iter 10, lr 0.00083135 time 8.418024s, mse: 29.46559715
|
| 652 |
+
[2026-01-09 10:18:25 root] (train_utils.py 185): INFO layer 34 lwc lac iter 11, lr 0.00048198 time 4.597981s, mse: 29.05239487
|
| 653 |
+
[2026-01-09 10:18:29 root] (train_utils.py 185): INFO layer 34 lwc lac iter 12, lr 0.00022092 time 3.877557s, mse: 28.86521339
|
| 654 |
+
[2026-01-09 10:18:33 root] (train_utils.py 185): INFO layer 34 lwc lac iter 13, lr 0.00005958 time 3.883606s, mse: 28.74409676
|
| 655 |
+
[2026-01-09 10:18:37 root] (train_utils.py 185): INFO layer 34 lwc lac iter 14, lr 0.00000500 time 4.302986s, mse: 28.70412636
|
| 656 |
+
[2026-01-09 10:18:38 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 657 |
+
[2026-01-09 10:18:38 root] (train_utils.py 108): INFO ========= Layer 35 =========
|
| 658 |
+
[2026-01-09 10:18:52 root] (train_utils.py 185): INFO layer 35 lwc lac iter 0, lr 0.00494542 time 9.411074s, mse: 108.25781250
|
| 659 |
+
[2026-01-09 10:19:00 root] (train_utils.py 185): INFO layer 35 lwc lac iter 1, lr 0.00478408 time 8.419429s, mse: 38.04971313
|
| 660 |
+
[2026-01-09 10:19:09 root] (train_utils.py 185): INFO layer 35 lwc lac iter 2, lr 0.00452302 time 8.463986s, mse: 31.63025665
|
| 661 |
+
[2026-01-09 10:19:17 root] (train_utils.py 185): INFO layer 35 lwc lac iter 3, lr 0.00417365 time 8.413347s, mse: 29.21376991
|
| 662 |
+
[2026-01-09 10:19:23 root] (train_utils.py 185): INFO layer 35 lwc lac iter 4, lr 0.00375125 time 6.243543s, mse: 28.19089508
|
| 663 |
+
[2026-01-09 10:19:28 root] (train_utils.py 185): INFO layer 35 lwc lac iter 5, lr 0.00327427 time 4.387824s, mse: 28.40728760
|
| 664 |
+
[2026-01-09 10:19:32 root] (train_utils.py 185): INFO layer 35 lwc lac iter 6, lr 0.00276356 time 4.512519s, mse: 27.74842644
|
| 665 |
+
[2026-01-09 10:19:37 root] (train_utils.py 185): INFO layer 35 lwc lac iter 7, lr 0.00224144 time 4.493203s, mse: 27.13273811
|
| 666 |
+
[2026-01-09 10:19:41 root] (train_utils.py 185): INFO layer 35 lwc lac iter 8, lr 0.00173073 time 4.561685s, mse: 26.53238487
|
| 667 |
+
[2026-01-09 10:19:46 root] (train_utils.py 185): INFO layer 35 lwc lac iter 9, lr 0.00125375 time 4.119229s, mse: 26.14052200
|
| 668 |
+
[2026-01-09 10:19:49 root] (train_utils.py 185): INFO layer 35 lwc lac iter 10, lr 0.00083135 time 3.929297s, mse: 25.63203621
|
| 669 |
+
[2026-01-09 10:19:53 root] (train_utils.py 185): INFO layer 35 lwc lac iter 11, lr 0.00048198 time 3.883827s, mse: 25.35079384
|
| 670 |
+
[2026-01-09 10:20:01 root] (train_utils.py 185): INFO layer 35 lwc lac iter 12, lr 0.00022092 time 7.978094s, mse: 25.21109390
|
| 671 |
+
[2026-01-09 10:20:11 root] (train_utils.py 185): INFO layer 35 lwc lac iter 13, lr 0.00005958 time 9.496463s, mse: 24.95710945
|
| 672 |
+
[2026-01-09 10:20:20 root] (train_utils.py 185): INFO layer 35 lwc lac iter 14, lr 0.00000500 time 9.466582s, mse: 24.85692596
|
| 673 |
+
[2026-01-09 10:20:21 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
|
| 674 |
+
[2026-01-09 10:21:06 root] (main.py 39): INFO Finished reparameterize model.
|
| 675 |
+
[2026-01-09 10:21:27 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.27 -> 0.25 GB (-0.02 GB)
|
| 676 |
+
[2026-01-09 10:21:50 root] (flat_utils.py 231): INFO saved weights at ./outputs/Qwen3-8B/w4a4/exp
|
| 677 |
+
[2026-01-09 10:22:00 root] (main.py 60): INFO wikitext2
|
| 678 |
+
[2026-01-09 10:22:44 root] (main.py 69): INFO 10.263629913330078
|
| 679 |
+
[2026-01-09 10:22:44 root] (main.py 60): INFO c4
|
| 680 |
+
[2026-01-09 10:24:57 root] (main.py 69): INFO 16.17665672302246
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_062728.txt
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-12 06:27:28 root] (args_utils.py 168): INFO Arguments:
|
| 2 |
+
[2026-01-12 06:27:28 root] (args_utils.py 169): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': True,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': True,
|
| 42 |
+
'save_matrix': False,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': 128,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-12 06:27:28 root] (args_utils.py 170): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-12 06:27:29 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-12 06:27:50 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-12 06:27:55 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 62 |
+
[2026-01-12 06:28:38 root] (main.py 39): INFO Finished reparameterize model.
|
| 63 |
+
[2026-01-12 06:29:06 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_063624.txt
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-12 06:36:24 root] (args_utils.py 168): INFO Arguments:
|
| 2 |
+
[2026-01-12 06:36:24 root] (args_utils.py 169): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': True,
|
| 42 |
+
'save_matrix': True,
|
| 43 |
+
'seed': 0,
|
| 44 |
+
'separate_vtrans': False,
|
| 45 |
+
'tasks': ['piqa',
|
| 46 |
+
'hellaswag',
|
| 47 |
+
'arc_easy',
|
| 48 |
+
'arc_challenge',
|
| 49 |
+
'winogrande',
|
| 50 |
+
'lambada_openai'],
|
| 51 |
+
'v_asym': False,
|
| 52 |
+
'v_bits': 16,
|
| 53 |
+
'v_groupsize': -1,
|
| 54 |
+
'w_asym': False,
|
| 55 |
+
'w_bits': 4,
|
| 56 |
+
'w_groupsize': 128,
|
| 57 |
+
'warmup': False}
|
| 58 |
+
[2026-01-12 06:36:24 root] (args_utils.py 170): INFO ------------------------------------------------------------
|
| 59 |
+
[2026-01-12 06:36:25 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 60 |
+
[2026-01-12 06:36:43 root] (main.py 25): INFO Finished loading training data.
|
| 61 |
+
[2026-01-12 06:36:49 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 62 |
+
[2026-01-12 06:36:53 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
|
| 63 |
+
[2026-01-12 06:37:33 root] (main.py 39): INFO Finished reparameterize model.
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_155601.txt
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-12 15:56:01 root] (args_utils.py 169): INFO Arguments:
|
| 2 |
+
[2026-01-12 15:56:01 root] (args_utils.py 170): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': True,
|
| 42 |
+
'save_matrix': True,
|
| 43 |
+
'save_qmodel_path': './qmodel/Qwen3-8B',
|
| 44 |
+
'seed': 0,
|
| 45 |
+
'separate_vtrans': False,
|
| 46 |
+
'tasks': ['piqa',
|
| 47 |
+
'hellaswag',
|
| 48 |
+
'arc_easy',
|
| 49 |
+
'arc_challenge',
|
| 50 |
+
'winogrande',
|
| 51 |
+
'lambada_openai'],
|
| 52 |
+
'v_asym': False,
|
| 53 |
+
'v_bits': 16,
|
| 54 |
+
'v_groupsize': -1,
|
| 55 |
+
'w_asym': False,
|
| 56 |
+
'w_bits': 4,
|
| 57 |
+
'w_groupsize': 128,
|
| 58 |
+
'warmup': False}
|
| 59 |
+
[2026-01-12 15:56:01 root] (args_utils.py 171): INFO ------------------------------------------------------------
|
| 60 |
+
[2026-01-12 15:56:02 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 61 |
+
[2026-01-12 15:56:23 root] (main.py 25): INFO Finished loading training data.
|
| 62 |
+
[2026-01-12 15:56:30 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 63 |
+
[2026-01-12 15:56:34 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
|
| 64 |
+
[2026-01-12 15:57:23 root] (main.py 39): INFO Finished reparameterize model.
|
| 65 |
+
[2026-01-12 15:58:02 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_160154.txt
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-12 16:01:54 root] (args_utils.py 169): INFO Arguments:
|
| 2 |
+
[2026-01-12 16:01:54 root] (args_utils.py 170): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': True,
|
| 42 |
+
'save_matrix': True,
|
| 43 |
+
'save_qmodel_path': './qmodel/Qwen3-8B',
|
| 44 |
+
'seed': 0,
|
| 45 |
+
'separate_vtrans': False,
|
| 46 |
+
'tasks': ['piqa',
|
| 47 |
+
'hellaswag',
|
| 48 |
+
'arc_easy',
|
| 49 |
+
'arc_challenge',
|
| 50 |
+
'winogrande',
|
| 51 |
+
'lambada_openai'],
|
| 52 |
+
'v_asym': False,
|
| 53 |
+
'v_bits': 16,
|
| 54 |
+
'v_groupsize': -1,
|
| 55 |
+
'w_asym': False,
|
| 56 |
+
'w_bits': 4,
|
| 57 |
+
'w_groupsize': 128,
|
| 58 |
+
'warmup': False}
|
| 59 |
+
[2026-01-12 16:01:54 root] (args_utils.py 171): INFO ------------------------------------------------------------
|
| 60 |
+
[2026-01-12 16:01:55 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 61 |
+
[2026-01-12 16:02:13 root] (main.py 25): INFO Finished loading training data.
|
| 62 |
+
[2026-01-12 16:02:21 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 63 |
+
[2026-01-12 16:02:25 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
|
| 64 |
+
[2026-01-12 16:03:15 root] (main.py 39): INFO Finished reparameterize model.
|
| 65 |
+
[2026-01-12 16:03:53 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
|
| 66 |
+
[2026-01-12 16:03:56 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 67 |
+
[2026-01-12 16:04:16 root] (main.py 98): INFO Quantized model for Qwen3 saved at ./qmodel/Qwen3-8B.
|
| 68 |
+
[2026-01-12 16:04:30 root] (main.py 107): INFO wikitext2
|
| 69 |
+
[2026-01-12 16:09:16 root] (main.py 116): INFO 10.263629913330078
|
| 70 |
+
[2026-01-12 16:09:16 root] (main.py 107): INFO c4
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_163532.txt
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-12 16:35:32 root] (args_utils.py 169): INFO Arguments:
|
| 2 |
+
[2026-01-12 16:35:32 root] (args_utils.py 170): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': True,
|
| 42 |
+
'save_matrix': True,
|
| 43 |
+
'save_qmodel_path': './qmodel/Qwen3-8B',
|
| 44 |
+
'seed': 0,
|
| 45 |
+
'separate_vtrans': False,
|
| 46 |
+
'tasks': ['piqa',
|
| 47 |
+
'hellaswag',
|
| 48 |
+
'arc_easy',
|
| 49 |
+
'arc_challenge',
|
| 50 |
+
'winogrande',
|
| 51 |
+
'lambada_openai'],
|
| 52 |
+
'v_asym': False,
|
| 53 |
+
'v_bits': 16,
|
| 54 |
+
'v_groupsize': -1,
|
| 55 |
+
'w_asym': False,
|
| 56 |
+
'w_bits': 4,
|
| 57 |
+
'w_groupsize': 128,
|
| 58 |
+
'warmup': False}
|
| 59 |
+
[2026-01-12 16:35:32 root] (args_utils.py 171): INFO ------------------------------------------------------------
|
| 60 |
+
[2026-01-12 16:35:33 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 61 |
+
[2026-01-12 16:35:51 root] (main.py 25): INFO Finished loading training data.
|
| 62 |
+
[2026-01-12 16:35:57 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 63 |
+
[2026-01-12 16:36:00 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
|
| 64 |
+
[2026-01-12 16:36:45 root] (main.py 39): INFO Finished reparameterize model.
|
| 65 |
+
[2026-01-12 16:37:16 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
|
| 66 |
+
[2026-01-12 16:37:18 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 67 |
+
[2026-01-12 16:37:45 root] (main.py 145): INFO Quantized model for Qwen3 saved at ./qmodel/Qwen3-8B.
|
| 68 |
+
[2026-01-12 16:37:57 root] (main.py 154): INFO wikitext2
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173005.txt
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-12 17:30:05 root] (args_utils.py 169): INFO Arguments:
|
| 2 |
+
[2026-01-12 17:30:05 root] (args_utils.py 170): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': True,
|
| 42 |
+
'save_matrix': True,
|
| 43 |
+
'save_qmodel_path': './qmodel/Qwen3-8B',
|
| 44 |
+
'seed': 0,
|
| 45 |
+
'separate_vtrans': False,
|
| 46 |
+
'tasks': ['piqa',
|
| 47 |
+
'hellaswag',
|
| 48 |
+
'arc_easy',
|
| 49 |
+
'arc_challenge',
|
| 50 |
+
'winogrande',
|
| 51 |
+
'lambada_openai'],
|
| 52 |
+
'v_asym': False,
|
| 53 |
+
'v_bits': 16,
|
| 54 |
+
'v_groupsize': -1,
|
| 55 |
+
'w_asym': False,
|
| 56 |
+
'w_bits': 4,
|
| 57 |
+
'w_groupsize': 128,
|
| 58 |
+
'warmup': False}
|
| 59 |
+
[2026-01-12 17:30:05 root] (args_utils.py 171): INFO ------------------------------------------------------------
|
| 60 |
+
[2026-01-12 17:30:06 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 61 |
+
[2026-01-12 17:30:26 root] (main.py 25): INFO Finished loading training data.
|
| 62 |
+
[2026-01-12 17:30:33 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 63 |
+
[2026-01-12 17:30:37 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
|
| 64 |
+
[2026-01-12 17:31:19 root] (main.py 39): INFO Finished reparameterize model.
|
| 65 |
+
[2026-01-12 17:31:54 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
|
| 66 |
+
[2026-01-12 17:31:56 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 67 |
+
[2026-01-12 17:32:19 root] (main.py 98): INFO Quantized model for Qwen3 saved at ./qmodel/Qwen3-8B.
|
| 68 |
+
[2026-01-12 17:32:31 root] (main.py 107): INFO wikitext2
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173513.txt
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-12 17:35:13 root] (args_utils.py 169): INFO Arguments:
|
| 2 |
+
[2026-01-12 17:35:13 root] (args_utils.py 170): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': True,
|
| 42 |
+
'save_matrix': True,
|
| 43 |
+
'save_qmodel_path': './qmodel/Qwen3-8B',
|
| 44 |
+
'seed': 0,
|
| 45 |
+
'separate_vtrans': False,
|
| 46 |
+
'tasks': ['piqa',
|
| 47 |
+
'hellaswag',
|
| 48 |
+
'arc_easy',
|
| 49 |
+
'arc_challenge',
|
| 50 |
+
'winogrande',
|
| 51 |
+
'lambada_openai'],
|
| 52 |
+
'v_asym': False,
|
| 53 |
+
'v_bits': 16,
|
| 54 |
+
'v_groupsize': -1,
|
| 55 |
+
'w_asym': False,
|
| 56 |
+
'w_bits': 4,
|
| 57 |
+
'w_groupsize': 128,
|
| 58 |
+
'warmup': False}
|
| 59 |
+
[2026-01-12 17:35:13 root] (args_utils.py 171): INFO ------------------------------------------------------------
|
| 60 |
+
[2026-01-12 17:35:14 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 61 |
+
[2026-01-12 17:35:32 root] (main.py 25): INFO Finished loading training data.
|
| 62 |
+
[2026-01-12 17:35:38 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 63 |
+
[2026-01-12 17:35:43 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
|
| 64 |
+
[2026-01-12 17:36:33 root] (main.py 39): INFO Finished reparameterize model.
|
| 65 |
+
[2026-01-12 17:37:07 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173832.txt
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-12 17:38:32 root] (args_utils.py 169): INFO Arguments:
|
| 2 |
+
[2026-01-12 17:38:32 root] (args_utils.py 170): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': True,
|
| 42 |
+
'save_matrix': True,
|
| 43 |
+
'save_qmodel_path': './qmodel/Qwen3-8B',
|
| 44 |
+
'seed': 0,
|
| 45 |
+
'separate_vtrans': False,
|
| 46 |
+
'tasks': ['piqa',
|
| 47 |
+
'hellaswag',
|
| 48 |
+
'arc_easy',
|
| 49 |
+
'arc_challenge',
|
| 50 |
+
'winogrande',
|
| 51 |
+
'lambada_openai'],
|
| 52 |
+
'v_asym': False,
|
| 53 |
+
'v_bits': 16,
|
| 54 |
+
'v_groupsize': -1,
|
| 55 |
+
'w_asym': False,
|
| 56 |
+
'w_bits': 4,
|
| 57 |
+
'w_groupsize': 128,
|
| 58 |
+
'warmup': False}
|
| 59 |
+
[2026-01-12 17:38:32 root] (args_utils.py 171): INFO ------------------------------------------------------------
|
| 60 |
+
[2026-01-12 17:38:32 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 61 |
+
[2026-01-12 17:38:51 root] (main.py 25): INFO Finished loading training data.
|
| 62 |
+
[2026-01-12 17:38:59 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 63 |
+
[2026-01-12 17:39:02 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
|
| 64 |
+
[2026-01-12 17:39:44 root] (main.py 39): INFO Finished reparameterize model.
|
| 65 |
+
[2026-01-12 17:40:16 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
|
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_181953.txt
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-01-12 18:19:53 root] (args_utils.py 169): INFO Arguments:
|
| 2 |
+
[2026-01-12 18:19:53 root] (args_utils.py 170): INFO {'a_asym': False,
|
| 3 |
+
'a_bits': 4,
|
| 4 |
+
'a_groupsize': 128,
|
| 5 |
+
'act_order': False,
|
| 6 |
+
'add_diag': True,
|
| 7 |
+
'cali_bsz': 4,
|
| 8 |
+
'cali_dataset': 'wikitext2',
|
| 9 |
+
'cali_trans': True,
|
| 10 |
+
'deactive_amp': False,
|
| 11 |
+
'diag_alpha': 0.3,
|
| 12 |
+
'diag_init': 'sq_style',
|
| 13 |
+
'direct_inv': False,
|
| 14 |
+
'distribute_model': False,
|
| 15 |
+
'epochs': 15,
|
| 16 |
+
'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
|
| 17 |
+
'exp_name': 'exp',
|
| 18 |
+
'flat_lr': 0.005,
|
| 19 |
+
'gptq': False,
|
| 20 |
+
'gptq_mse': False,
|
| 21 |
+
'hf_token': None,
|
| 22 |
+
'k_asym': False,
|
| 23 |
+
'k_bits': 16,
|
| 24 |
+
'k_groupsize': -1,
|
| 25 |
+
'lac': True,
|
| 26 |
+
'lm_eval': False,
|
| 27 |
+
'lm_eval_batch_size': 128,
|
| 28 |
+
'lwc': True,
|
| 29 |
+
'matrix_path': None,
|
| 30 |
+
'model': 'Qwen/Qwen3-8B',
|
| 31 |
+
'model_name': 'Qwen3-8B',
|
| 32 |
+
'nsamples': 128,
|
| 33 |
+
'output_dir': './outputs',
|
| 34 |
+
'percdamp': 0.01,
|
| 35 |
+
'q_asym': False,
|
| 36 |
+
'q_bits': 16,
|
| 37 |
+
'q_groupsize': -1,
|
| 38 |
+
'quantize': True,
|
| 39 |
+
'quantized_save': False,
|
| 40 |
+
'reload_matrix': False,
|
| 41 |
+
'resume': True,
|
| 42 |
+
'save_matrix': True,
|
| 43 |
+
'save_qmodel_path': './qmodel/Qwen3-8B',
|
| 44 |
+
'seed': 0,
|
| 45 |
+
'separate_vtrans': False,
|
| 46 |
+
'tasks': ['piqa',
|
| 47 |
+
'hellaswag',
|
| 48 |
+
'arc_easy',
|
| 49 |
+
'arc_challenge',
|
| 50 |
+
'winogrande',
|
| 51 |
+
'lambada_openai'],
|
| 52 |
+
'v_asym': False,
|
| 53 |
+
'v_bits': 16,
|
| 54 |
+
'v_groupsize': -1,
|
| 55 |
+
'w_asym': False,
|
| 56 |
+
'w_bits': 4,
|
| 57 |
+
'w_groupsize': 128,
|
| 58 |
+
'warmup': False}
|
| 59 |
+
[2026-01-12 18:19:53 root] (args_utils.py 171): INFO ------------------------------------------------------------
|
| 60 |
+
[2026-01-12 18:19:54 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 61 |
+
[2026-01-12 18:20:13 root] (main.py 25): INFO Finished loading training data.
|
| 62 |
+
[2026-01-12 18:20:20 root] (main.py 29): INFO Finished applying FlatQuant to model.
|
| 63 |
+
[2026-01-12 18:20:24 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
|
| 64 |
+
[2026-01-12 18:21:14 root] (main.py 39): INFO Finished reparameterize model.
|
| 65 |
+
[2026-01-12 18:21:59 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
|
| 66 |
+
[2026-01-12 18:22:02 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
|
| 67 |
+
[2026-01-12 18:22:27 root] (main.py 104): INFO Quantized model for Qwen3 saved at ./qmodel/Qwen3-8B.
|
| 68 |
+
[2026-01-12 18:22:44 root] (main.py 113): INFO wikitext2
|
outputs/Qwen3-8B/w4a4/exp/model-00001-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:31756e3da4017b19338662bd1e6ea8d157c287c4d303910f92c897188a79399a
|
| 3 |
+
size 4734049352
|
outputs/Qwen3-8B/w4a4/exp/model-00002-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf1344bde98672393a75f1eac7d77cd94eb9324696c83fb9d771278c7bbf9b52
|
| 3 |
+
size 1461808272
|
outputs/Qwen3-8B/w4a4/exp/model.safetensors.index.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputs/Qwen3-8B/w4a4/exp/quantization_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"w_bits": 4,
|
| 3 |
+
"model_name": "Qwen/Qwen3-8B",
|
| 4 |
+
"symmetric": true,
|
| 5 |
+
"format": "packed_int4",
|
| 6 |
+
"sharded": true
|
| 7 |
+
}
|
outputs/Qwen3-8B/w4a4/exp/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
|
| 3 |
+
size 11422654
|
outputs/Qwen3-8B/w4a4/exp/tokenizer_config.json
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
}
|
| 213 |
+
},
|
| 214 |
+
"additional_special_tokens": [
|
| 215 |
+
"<|im_start|>",
|
| 216 |
+
"<|im_end|>",
|
| 217 |
+
"<|object_ref_start|>",
|
| 218 |
+
"<|object_ref_end|>",
|
| 219 |
+
"<|box_start|>",
|
| 220 |
+
"<|box_end|>",
|
| 221 |
+
"<|quad_start|>",
|
| 222 |
+
"<|quad_end|>",
|
| 223 |
+
"<|vision_start|>",
|
| 224 |
+
"<|vision_end|>",
|
| 225 |
+
"<|vision_pad|>",
|
| 226 |
+
"<|image_pad|>",
|
| 227 |
+
"<|video_pad|>"
|
| 228 |
+
],
|
| 229 |
+
"bos_token": null,
|
| 230 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
|
| 231 |
+
"clean_up_tokenization_spaces": false,
|
| 232 |
+
"eos_token": "<|im_end|>",
|
| 233 |
+
"errors": "replace",
|
| 234 |
+
"model_max_length": 131072,
|
| 235 |
+
"pad_token": "<|endoftext|>",
|
| 236 |
+
"split_special_tokens": false,
|
| 237 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 238 |
+
"unk_token": null
|
| 239 |
+
}
|