SubSir commited on
Commit
ffb6b3c
·
verified ·
1 Parent(s): cc0a884

Upload folder: outputs/Qwen3-8B

Browse files
Files changed (31) hide show
  1. .gitattributes +1 -0
  2. outputs/Qwen3-8B/w4a4/exp/config.json +45 -0
  3. outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth +3 -0
  4. outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth +3 -0
  5. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162158.txt +58 -0
  6. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162240.txt +59 -0
  7. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162627.txt +61 -0
  8. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162858.txt +59 -0
  9. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_163842.txt +61 -0
  10. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_164042.txt +62 -0
  11. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_164538.txt +135 -0
  12. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_165516.txt +106 -0
  13. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_165858.txt +79 -0
  14. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_184025.txt +675 -0
  15. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_195354.txt +680 -0
  16. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260109_092702.txt +680 -0
  17. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_062728.txt +63 -0
  18. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_063624.txt +63 -0
  19. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_155601.txt +65 -0
  20. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_160154.txt +70 -0
  21. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_163532.txt +68 -0
  22. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173005.txt +68 -0
  23. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173513.txt +65 -0
  24. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173832.txt +65 -0
  25. outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_181953.txt +68 -0
  26. outputs/Qwen3-8B/w4a4/exp/model-00001-of-00002.safetensors +3 -0
  27. outputs/Qwen3-8B/w4a4/exp/model-00002-of-00002.safetensors +3 -0
  28. outputs/Qwen3-8B/w4a4/exp/model.safetensors.index.json +0 -0
  29. outputs/Qwen3-8B/w4a4/exp/quantization_config.json +7 -0
  30. outputs/Qwen3-8B/w4a4/exp/tokenizer.json +3 -0
  31. outputs/Qwen3-8B/w4a4/exp/tokenizer_config.json +239 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ outputs/Qwen3-8B/w4a4/exp/tokenizer.json filter=lfs diff=lfs merge=lfs -text
outputs/Qwen3-8B/w4a4/exp/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3FlatQuantForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 12288,
14
+ "max_position_embeddings": 40960,
15
+ "max_window_layers": 36,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 36,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.51.0",
27
+ "use_cache": true,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936,
30
+ "fake_quant_config": {
31
+ "w_bits": 4,
32
+ "a_bits": 4,
33
+ "a_asym": false,
34
+ "w_asym": false,
35
+ "k_bits": 16,
36
+ "k_asym": false,
37
+ "k_groupsize": -1,
38
+ "v_bits": 16,
39
+ "v_asym": false,
40
+ "v_groupsize": -1,
41
+ "lwc": true,
42
+ "lac": true,
43
+ "direct_inv": false
44
+ }
45
+ }
outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea4be9a10ff9947ffff53ff39806eb983423776916352323e46f9eec6ed60dbc
3
+ size 31799687
outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5aeb255e7df9875349b4ebdf66b04278604545d1a123fd1979a142ba3e487f4
3
+ size 32004907
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162158.txt ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 16:21:58 root] (args_utils.py 159): INFO Arguments:
2
+ [2026-01-08 16:21:58 root] (args_utils.py 160): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': -1,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen3/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': -1,
57
+ 'warmup': False}
58
+ [2026-01-08 16:21:58 root] (args_utils.py 161): INFO ------------------------------------------------------------
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162240.txt ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 16:22:40 root] (args_utils.py 159): INFO Arguments:
2
+ [2026-01-08 16:22:40 root] (args_utils.py 160): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': -1,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': -1,
57
+ 'warmup': False}
58
+ [2026-01-08 16:22:40 root] (args_utils.py 161): INFO ------------------------------------------------------------
59
+ [2026-01-08 16:22:41 root] (model_utils.py 92): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162627.txt ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 16:26:27 root] (args_utils.py 159): INFO Arguments:
2
+ [2026-01-08 16:26:27 root] (args_utils.py 160): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': -1,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': -1,
57
+ 'warmup': False}
58
+ [2026-01-08 16:26:27 root] (args_utils.py 161): INFO ------------------------------------------------------------
59
+ [2026-01-08 16:26:28 root] (model_utils.py 92): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-08 16:26:51 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-08 16:26:56 root] (main.py 29): INFO Finished applying FlatQuant to model.
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_162858.txt ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 16:28:58 root] (args_utils.py 159): INFO Arguments:
2
+ [2026-01-08 16:28:58 root] (args_utils.py 160): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': -1,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': -1,
57
+ 'warmup': False}
58
+ [2026-01-08 16:28:58 root] (args_utils.py 161): INFO ------------------------------------------------------------
59
+ [2026-01-08 16:28:58 root] (model_utils.py 81): ERROR Qwen3 model is not available. Error: attempted relative import with no known parent package
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_163842.txt ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 16:38:42 root] (args_utils.py 158): INFO Arguments:
2
+ [2026-01-08 16:38:42 root] (args_utils.py 159): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': -1,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': -1,
57
+ 'warmup': False}
58
+ [2026-01-08 16:38:42 root] (args_utils.py 160): INFO ------------------------------------------------------------
59
+ [2026-01-08 16:38:43 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-08 16:39:04 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-08 16:39:09 root] (main.py 29): INFO Finished applying FlatQuant to model.
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_164042.txt ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 16:40:42 root] (args_utils.py 158): INFO Arguments:
2
+ [2026-01-08 16:40:42 root] (args_utils.py 159): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': -1,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': -1,
57
+ 'warmup': False}
58
+ [2026-01-08 16:40:42 root] (args_utils.py 160): INFO ------------------------------------------------------------
59
+ [2026-01-08 16:40:43 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-08 16:41:03 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-08 16:41:08 root] (main.py 29): INFO Finished applying FlatQuant to model.
62
+ [2026-01-08 16:41:11 root] (train_utils.py 99): INFO ========= Layer 0 =========
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_164538.txt ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 16:45:38 root] (args_utils.py 158): INFO Arguments:
2
+ [2026-01-08 16:45:38 root] (args_utils.py 159): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': -1,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': -1,
57
+ 'warmup': False}
58
+ [2026-01-08 16:45:38 root] (args_utils.py 160): INFO ------------------------------------------------------------
59
+ [2026-01-08 16:45:39 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-08 16:45:59 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-08 16:46:04 root] (main.py 29): INFO Finished applying FlatQuant to model.
62
+ [2026-01-08 16:46:06 root] (train_utils.py 108): INFO ========= Layer 0 =========
63
+ [2026-01-08 16:46:14 root] (train_utils.py 181): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 5.177632s, mse: 0.75320721
64
+ [2026-01-08 16:46:18 root] (train_utils.py 181): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.951230s, mse: 0.69738150
65
+ [2026-01-08 16:46:22 root] (train_utils.py 181): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.918307s, mse: 0.57322526
66
+ [2026-01-08 16:46:26 root] (train_utils.py 181): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.922915s, mse: 0.53385043
67
+ [2026-01-08 16:46:30 root] (train_utils.py 181): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.897071s, mse: 0.52587473
68
+ [2026-01-08 16:46:34 root] (train_utils.py 181): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.904243s, mse: 0.52103043
69
+ [2026-01-08 16:46:38 root] (train_utils.py 181): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.906013s, mse: 0.51764816
70
+ [2026-01-08 16:46:42 root] (train_utils.py 181): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.934398s, mse: 0.51576799
71
+ [2026-01-08 16:46:46 root] (train_utils.py 181): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 3.926790s, mse: 0.51471919
72
+ [2026-01-08 16:46:50 root] (train_utils.py 181): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.909371s, mse: 0.51408356
73
+ [2026-01-08 16:46:54 root] (train_utils.py 181): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 3.908848s, mse: 0.51356357
74
+ [2026-01-08 16:46:57 root] (train_utils.py 181): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.920370s, mse: 0.51325279
75
+ [2026-01-08 16:47:01 root] (train_utils.py 181): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.914565s, mse: 0.51308525
76
+ [2026-01-08 16:47:05 root] (train_utils.py 181): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.907758s, mse: 0.51298046
77
+ [2026-01-08 16:47:09 root] (train_utils.py 181): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.903909s, mse: 0.51291251
78
+ [2026-01-08 16:47:10 root] (train_utils.py 187): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
79
+ [2026-01-08 16:47:10 root] (train_utils.py 108): INFO ========= Layer 1 =========
80
+ [2026-01-08 16:47:17 root] (train_utils.py 181): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 4.529751s, mse: 1.26446903
81
+ [2026-01-08 16:47:21 root] (train_utils.py 181): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 3.905758s, mse: 1.20416105
82
+ [2026-01-08 16:47:25 root] (train_utils.py 181): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.919909s, mse: 1.32050300
83
+ [2026-01-08 16:47:29 root] (train_utils.py 181): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 3.911430s, mse: 1.18387961
84
+ [2026-01-08 16:47:33 root] (train_utils.py 181): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 3.908307s, mse: 1.16144323
85
+ [2026-01-08 16:47:37 root] (train_utils.py 181): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.900176s, mse: 1.14692831
86
+ [2026-01-08 16:47:41 root] (train_utils.py 181): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.908509s, mse: 1.13803911
87
+ [2026-01-08 16:47:44 root] (train_utils.py 181): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 3.990242s, mse: 1.13248944
88
+ [2026-01-08 16:47:48 root] (train_utils.py 181): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 3.922920s, mse: 1.12851596
89
+ [2026-01-08 16:47:52 root] (train_utils.py 181): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 3.909056s, mse: 1.12513459
90
+ [2026-01-08 16:47:56 root] (train_utils.py 181): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 3.911661s, mse: 1.12304866
91
+ [2026-01-08 16:48:00 root] (train_utils.py 181): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 3.920923s, mse: 1.12149227
92
+ [2026-01-08 16:48:04 root] (train_utils.py 181): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 3.930339s, mse: 1.12047637
93
+ [2026-01-08 16:48:08 root] (train_utils.py 181): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.923637s, mse: 1.12013018
94
+ [2026-01-08 16:48:12 root] (train_utils.py 181): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 3.901148s, mse: 1.12002420
95
+ [2026-01-08 16:48:12 root] (train_utils.py 187): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
96
+ [2026-01-08 16:48:13 root] (train_utils.py 108): INFO ========= Layer 2 =========
97
+ [2026-01-08 16:48:20 root] (train_utils.py 181): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 4.564884s, mse: 2.01507950
98
+ [2026-01-08 16:48:24 root] (train_utils.py 181): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 3.932034s, mse: 1.94718516
99
+ [2026-01-08 16:48:28 root] (train_utils.py 181): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 3.938544s, mse: 1.88504732
100
+ [2026-01-08 16:48:31 root] (train_utils.py 181): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 3.911523s, mse: 1.81488454
101
+ [2026-01-08 16:48:35 root] (train_utils.py 181): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 3.927961s, mse: 1.85724211
102
+ [2026-01-08 16:48:39 root] (train_utils.py 181): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 3.927059s, mse: 2.01470947
103
+ [2026-01-08 16:48:43 root] (train_utils.py 181): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 3.914749s, mse: 1.76976871
104
+ [2026-01-08 16:48:47 root] (train_utils.py 181): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 3.930597s, mse: 1.76208174
105
+ [2026-01-08 16:48:51 root] (train_utils.py 181): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 3.942161s, mse: 1.75805795
106
+ [2026-01-08 16:48:55 root] (train_utils.py 181): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.925775s, mse: 1.75492477
107
+ [2026-01-08 16:48:59 root] (train_utils.py 181): INFO layer 2 lwc lac iter 10, lr 0.00083135 time 3.927086s, mse: 1.75177002
108
+ [2026-01-08 16:49:03 root] (train_utils.py 181): INFO layer 2 lwc lac iter 11, lr 0.00048198 time 3.916509s, mse: 1.74904215
109
+ [2026-01-08 16:49:07 root] (train_utils.py 181): INFO layer 2 lwc lac iter 12, lr 0.00022092 time 3.925464s, mse: 1.74770069
110
+ [2026-01-08 16:49:11 root] (train_utils.py 181): INFO layer 2 lwc lac iter 13, lr 0.00005958 time 3.931881s, mse: 1.74607742
111
+ [2026-01-08 16:49:15 root] (train_utils.py 181): INFO layer 2 lwc lac iter 14, lr 0.00000500 time 3.918606s, mse: 1.74535310
112
+ [2026-01-08 16:49:15 root] (train_utils.py 187): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
113
+ [2026-01-08 16:49:16 root] (train_utils.py 108): INFO ========= Layer 3 =========
114
+ [2026-01-08 16:49:22 root] (train_utils.py 181): INFO layer 3 lwc lac iter 0, lr 0.00494542 time 4.469205s, mse: 3.37112665
115
+ [2026-01-08 16:49:26 root] (train_utils.py 181): INFO layer 3 lwc lac iter 1, lr 0.00478408 time 3.908183s, mse: 3.28124046
116
+ [2026-01-08 16:49:30 root] (train_utils.py 181): INFO layer 3 lwc lac iter 2, lr 0.00452302 time 3.881730s, mse: 3.07249451
117
+ [2026-01-08 16:49:34 root] (train_utils.py 181): INFO layer 3 lwc lac iter 3, lr 0.00417365 time 3.889869s, mse: 2.85796380
118
+ [2026-01-08 16:49:38 root] (train_utils.py 181): INFO layer 3 lwc lac iter 4, lr 0.00375125 time 3.935233s, mse: 2.88136601
119
+ [2026-01-08 16:49:42 root] (train_utils.py 181): INFO layer 3 lwc lac iter 5, lr 0.00327427 time 3.948762s, mse: 3.00021911
120
+ [2026-01-08 16:49:46 root] (train_utils.py 181): INFO layer 3 lwc lac iter 6, lr 0.00276356 time 3.965647s, mse: 2.90009570
121
+ [2026-01-08 16:49:50 root] (train_utils.py 181): INFO layer 3 lwc lac iter 7, lr 0.00224144 time 3.905676s, mse: 2.84187627
122
+ [2026-01-08 16:49:54 root] (train_utils.py 181): INFO layer 3 lwc lac iter 8, lr 0.00173073 time 3.916992s, mse: 3.88529181
123
+ [2026-01-08 16:49:58 root] (train_utils.py 181): INFO layer 3 lwc lac iter 9, lr 0.00125375 time 3.918374s, mse: 2.84166765
124
+ [2026-01-08 16:50:02 root] (train_utils.py 181): INFO layer 3 lwc lac iter 10, lr 0.00083135 time 3.926870s, mse: 2.83363008
125
+ [2026-01-08 16:50:05 root] (train_utils.py 181): INFO layer 3 lwc lac iter 11, lr 0.00048198 time 3.899929s, mse: 2.82961488
126
+ [2026-01-08 16:50:09 root] (train_utils.py 181): INFO layer 3 lwc lac iter 12, lr 0.00022092 time 3.909861s, mse: 2.82789564
127
+ [2026-01-08 16:50:13 root] (train_utils.py 181): INFO layer 3 lwc lac iter 13, lr 0.00005958 time 3.883598s, mse: 2.82704329
128
+ [2026-01-08 16:50:17 root] (train_utils.py 181): INFO layer 3 lwc lac iter 14, lr 0.00000500 time 3.906606s, mse: 2.82679415
129
+ [2026-01-08 16:50:18 root] (train_utils.py 187): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
130
+ [2026-01-08 16:50:18 root] (train_utils.py 108): INFO ========= Layer 4 =========
131
+ [2026-01-08 16:50:25 root] (train_utils.py 181): INFO layer 4 lwc lac iter 0, lr 0.00494542 time 4.617949s, mse: 5.87795258
132
+ [2026-01-08 16:50:29 root] (train_utils.py 181): INFO layer 4 lwc lac iter 1, lr 0.00478408 time 3.927140s, mse: 5.58140898
133
+ [2026-01-08 16:50:33 root] (train_utils.py 181): INFO layer 4 lwc lac iter 2, lr 0.00452302 time 3.943356s, mse: 5.40157461
134
+ [2026-01-08 16:50:37 root] (train_utils.py 181): INFO layer 4 lwc lac iter 3, lr 0.00417365 time 3.934676s, mse: 4.97706127
135
+ [2026-01-08 16:50:41 root] (train_utils.py 181): INFO layer 4 lwc lac iter 4, lr 0.00375125 time 3.936047s, mse: 4.83699369
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_165516.txt ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 16:55:16 root] (args_utils.py 159): INFO Arguments:
2
+ [2026-01-08 16:55:16 root] (args_utils.py 160): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': -1,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': -1,
57
+ 'warmup': False}
58
+ [2026-01-08 16:55:16 root] (args_utils.py 161): INFO ------------------------------------------------------------
59
+ [2026-01-08 16:55:17 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-08 16:55:38 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-08 16:55:42 root] (main.py 29): INFO Finished applying FlatQuant to model.
62
+ [2026-01-08 16:55:45 root] (train_utils.py 108): INFO ========= Layer 0 =========
63
+ [2026-01-08 16:55:53 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 5.095334s, mse: 0.02583671
64
+ [2026-01-08 16:55:57 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.934147s, mse: 0.01396359
65
+ [2026-01-08 16:56:01 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.939300s, mse: 0.01044113
66
+ [2026-01-08 16:56:05 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.908403s, mse: 0.00969208
67
+ [2026-01-08 16:56:09 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.915706s, mse: 0.00940374
68
+ [2026-01-08 16:56:13 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.912972s, mse: 0.00924401
69
+ [2026-01-08 16:56:17 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.921335s, mse: 0.00912234
70
+ [2026-01-08 16:56:21 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.932727s, mse: 0.00903957
71
+ [2026-01-08 16:56:25 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 4.433334s, mse: 0.00895381
72
+ [2026-01-08 16:56:29 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.907690s, mse: 0.00888840
73
+ [2026-01-08 16:56:33 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 3.904034s, mse: 0.00881676
74
+ [2026-01-08 16:56:37 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.927204s, mse: 0.00877623
75
+ [2026-01-08 16:56:41 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.925085s, mse: 0.00874014
76
+ [2026-01-08 16:56:45 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.916118s, mse: 0.00871035
77
+ [2026-01-08 16:56:48 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.921848s, mse: 0.00869927
78
+ [2026-01-08 16:56:49 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
79
+ [2026-01-08 16:56:49 root] (train_utils.py 108): INFO ========= Layer 1 =========
80
+ [2026-01-08 16:56:56 root] (train_utils.py 185): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 4.588893s, mse: 0.01656580
81
+ [2026-01-08 16:57:01 root] (train_utils.py 185): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 4.451948s, mse: 0.00616941
82
+ [2026-01-08 16:57:05 root] (train_utils.py 185): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.968735s, mse: 0.00441587
83
+ [2026-01-08 16:57:09 root] (train_utils.py 185): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 4.437059s, mse: 0.00399985
84
+ [2026-01-08 16:57:13 root] (train_utils.py 185): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 4.038846s, mse: 0.00383769
85
+ [2026-01-08 16:57:17 root] (train_utils.py 185): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.925880s, mse: 0.00373424
86
+ [2026-01-08 16:57:21 root] (train_utils.py 185): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.923243s, mse: 0.00368217
87
+ [2026-01-08 16:57:25 root] (train_utils.py 185): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 3.938122s, mse: 0.00363120
88
+ [2026-01-08 16:57:29 root] (train_utils.py 185): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 3.932140s, mse: 0.00357347
89
+ [2026-01-08 16:57:33 root] (train_utils.py 185): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 3.947205s, mse: 0.00352856
90
+ [2026-01-08 16:57:37 root] (train_utils.py 185): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 3.939108s, mse: 0.00350962
91
+ [2026-01-08 16:57:41 root] (train_utils.py 185): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 4.007798s, mse: 0.00346109
92
+ [2026-01-08 16:57:45 root] (train_utils.py 185): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 3.940766s, mse: 0.00342262
93
+ [2026-01-08 16:57:49 root] (train_utils.py 185): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.952263s, mse: 0.00341267
94
+ [2026-01-08 16:57:53 root] (train_utils.py 185): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 3.938083s, mse: 0.00340322
95
+ [2026-01-08 16:57:53 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
96
+ [2026-01-08 16:57:54 root] (train_utils.py 108): INFO ========= Layer 2 =========
97
+ [2026-01-08 16:58:00 root] (train_utils.py 185): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 4.587668s, mse: 0.03411270
98
+ [2026-01-08 16:58:04 root] (train_utils.py 185): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 3.952559s, mse: 0.00820368
99
+ [2026-01-08 16:58:08 root] (train_utils.py 185): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 3.936083s, mse: 0.00562381
100
+ [2026-01-08 16:58:12 root] (train_utils.py 185): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 3.948321s, mse: 0.00503509
101
+ [2026-01-08 16:58:16 root] (train_utils.py 185): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 3.930907s, mse: 0.00482035
102
+ [2026-01-08 16:58:20 root] (train_utils.py 185): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 3.918012s, mse: 0.00470118
103
+ [2026-01-08 16:58:24 root] (train_utils.py 185): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 3.927189s, mse: 0.00463578
104
+ [2026-01-08 16:58:28 root] (train_utils.py 185): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 3.935595s, mse: 0.00459153
105
+ [2026-01-08 16:58:32 root] (train_utils.py 185): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 3.921202s, mse: 0.00453731
106
+ [2026-01-08 16:58:36 root] (train_utils.py 185): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.939279s, mse: 0.00450525
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_165858.txt ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 16:58:58 root] (args_utils.py 159): INFO Arguments:
2
+ [2026-01-08 16:58:58 root] (args_utils.py 160): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': 128,
57
+ 'warmup': False}
58
+ [2026-01-08 16:58:58 root] (args_utils.py 161): INFO ------------------------------------------------------------
59
+ [2026-01-08 16:58:59 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-08 16:59:21 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-08 16:59:26 root] (main.py 29): INFO Finished applying FlatQuant to model.
62
+ [2026-01-08 16:59:29 root] (train_utils.py 108): INFO ========= Layer 0 =========
63
+ [2026-01-08 16:59:39 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 6.397410s, mse: 0.01574295
64
+ [2026-01-08 16:59:44 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 4.447838s, mse: 0.01115426
65
+ [2026-01-08 16:59:48 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 4.593246s, mse: 0.00938093
66
+ [2026-01-08 16:59:53 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 4.590999s, mse: 0.00881439
67
+ [2026-01-08 16:59:58 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 4.522616s, mse: 0.00857142
68
+ [2026-01-08 17:00:02 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 4.538843s, mse: 0.00849318
69
+ [2026-01-08 17:00:07 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 4.488467s, mse: 0.00832680
70
+ [2026-01-08 17:00:11 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 4.497394s, mse: 0.00828776
71
+ [2026-01-08 17:00:16 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 4.503586s, mse: 0.00818714
72
+ [2026-01-08 17:00:20 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 4.574569s, mse: 0.00813103
73
+ [2026-01-08 17:00:25 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 4.571058s, mse: 0.00808381
74
+ [2026-01-08 17:00:29 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 4.520602s, mse: 0.00804329
75
+ [2026-01-08 17:00:34 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 4.617018s, mse: 0.00799941
76
+ [2026-01-08 17:00:39 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 4.615621s, mse: 0.00795571
77
+ [2026-01-08 17:00:43 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 4.499076s, mse: 0.00794016
78
+ [2026-01-08 17:00:44 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
79
+ [2026-01-08 17:00:44 root] (train_utils.py 108): INFO ========= Layer 1 =========
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_184025.txt ADDED
@@ -0,0 +1,675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 18:40:25 root] (args_utils.py 159): INFO Arguments:
2
+ [2026-01-08 18:40:25 root] (args_utils.py 160): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': True,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': 128,
57
+ 'warmup': False}
58
+ [2026-01-08 18:40:25 root] (args_utils.py 161): INFO ------------------------------------------------------------
59
+ [2026-01-08 18:40:28 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-08 18:40:44 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-08 18:40:53 root] (main.py 29): INFO Finished applying FlatQuant to model.
62
+ [2026-01-08 18:40:57 root] (train_utils.py 108): INFO ========= Layer 0 =========
63
+ [2026-01-08 18:41:07 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 6.342341s, mse: 0.01574295
64
+ [2026-01-08 18:41:11 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.926945s, mse: 0.01115426
65
+ [2026-01-08 18:41:14 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.866743s, mse: 0.00938093
66
+ [2026-01-08 18:41:18 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.864751s, mse: 0.00881439
67
+ [2026-01-08 18:41:22 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.862903s, mse: 0.00857142
68
+ [2026-01-08 18:41:26 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.863009s, mse: 0.00849318
69
+ [2026-01-08 18:41:30 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.868398s, mse: 0.00832680
70
+ [2026-01-08 18:41:34 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.862813s, mse: 0.00828776
71
+ [2026-01-08 18:41:38 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 3.860975s, mse: 0.00818714
72
+ [2026-01-08 18:41:41 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.861234s, mse: 0.00813103
73
+ [2026-01-08 18:41:45 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 3.870577s, mse: 0.00808381
74
+ [2026-01-08 18:41:49 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.860093s, mse: 0.00804329
75
+ [2026-01-08 18:41:53 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.875212s, mse: 0.00799941
76
+ [2026-01-08 18:41:57 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.869323s, mse: 0.00795571
77
+ [2026-01-08 18:42:01 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.871119s, mse: 0.00794016
78
+ [2026-01-08 18:42:01 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
79
+ [2026-01-08 18:42:02 root] (train_utils.py 108): INFO ========= Layer 1 =========
80
+ [2026-01-08 18:42:11 root] (train_utils.py 185): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 5.915874s, mse: 0.00892038
81
+ [2026-01-08 18:42:15 root] (train_utils.py 185): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 3.927274s, mse: 0.00479663
82
+ [2026-01-08 18:42:18 root] (train_utils.py 185): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.867823s, mse: 0.00384854
83
+ [2026-01-08 18:42:22 root] (train_utils.py 185): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 3.867914s, mse: 0.00355465
84
+ [2026-01-08 18:42:26 root] (train_utils.py 185): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 3.868877s, mse: 0.00343135
85
+ [2026-01-08 18:42:30 root] (train_utils.py 185): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.867419s, mse: 0.00337971
86
+ [2026-01-08 18:42:34 root] (train_utils.py 185): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.868710s, mse: 0.00336636
87
+ [2026-01-08 18:42:38 root] (train_utils.py 185): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 3.870240s, mse: 0.00329515
88
+ [2026-01-08 18:42:42 root] (train_utils.py 185): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 3.870138s, mse: 0.00326379
89
+ [2026-01-08 18:42:45 root] (train_utils.py 185): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 3.872112s, mse: 0.00321724
90
+ [2026-01-08 18:42:49 root] (train_utils.py 185): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 3.867391s, mse: 0.00316591
91
+ [2026-01-08 18:42:53 root] (train_utils.py 185): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 3.864894s, mse: 0.00313276
92
+ [2026-01-08 18:42:57 root] (train_utils.py 185): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 3.868528s, mse: 0.00310469
93
+ [2026-01-08 18:43:01 root] (train_utils.py 185): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.865803s, mse: 0.00308243
94
+ [2026-01-08 18:43:05 root] (train_utils.py 185): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 3.869683s, mse: 0.00306749
95
+ [2026-01-08 18:43:05 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
96
+ [2026-01-08 18:43:06 root] (train_utils.py 108): INFO ========= Layer 2 =========
97
+ [2026-01-08 18:43:14 root] (train_utils.py 185): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 5.276760s, mse: 0.01750460
98
+ [2026-01-08 18:43:18 root] (train_utils.py 185): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 3.925459s, mse: 0.00626545
99
+ [2026-01-08 18:43:22 root] (train_utils.py 185): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 3.866118s, mse: 0.00494380
100
+ [2026-01-08 18:43:26 root] (train_utils.py 185): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 3.866142s, mse: 0.00453308
101
+ [2026-01-08 18:43:29 root] (train_utils.py 185): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 3.870597s, mse: 0.00439964
102
+ [2026-01-08 18:43:33 root] (train_utils.py 185): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 3.871755s, mse: 0.00429795
103
+ [2026-01-08 18:43:37 root] (train_utils.py 185): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 3.868093s, mse: 0.00425246
104
+ [2026-01-08 18:43:41 root] (train_utils.py 185): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 3.871027s, mse: 0.00420888
105
+ [2026-01-08 18:43:45 root] (train_utils.py 185): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 3.869353s, mse: 0.00415287
106
+ [2026-01-08 18:43:49 root] (train_utils.py 185): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.868892s, mse: 0.00411024
107
+ [2026-01-08 18:43:53 root] (train_utils.py 185): INFO layer 2 lwc lac iter 10, lr 0.00083135 time 3.875470s, mse: 0.00407672
108
+ [2026-01-08 18:43:57 root] (train_utils.py 185): INFO layer 2 lwc lac iter 11, lr 0.00048198 time 3.868368s, mse: 0.00404750
109
+ [2026-01-08 18:44:00 root] (train_utils.py 185): INFO layer 2 lwc lac iter 12, lr 0.00022092 time 3.870399s, mse: 0.00401742
110
+ [2026-01-08 18:44:04 root] (train_utils.py 185): INFO layer 2 lwc lac iter 13, lr 0.00005958 time 3.868037s, mse: 0.00398090
111
+ [2026-01-08 18:44:08 root] (train_utils.py 185): INFO layer 2 lwc lac iter 14, lr 0.00000500 time 3.879722s, mse: 0.00397130
112
+ [2026-01-08 18:44:09 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
113
+ [2026-01-08 18:44:09 root] (train_utils.py 108): INFO ========= Layer 3 =========
114
+ [2026-01-08 18:44:17 root] (train_utils.py 185): INFO layer 3 lwc lac iter 0, lr 0.00494542 time 5.277266s, mse: 0.02308414
115
+ [2026-01-08 18:44:21 root] (train_utils.py 185): INFO layer 3 lwc lac iter 1, lr 0.00478408 time 3.869237s, mse: 0.01333557
116
+ [2026-01-08 18:44:25 root] (train_utils.py 185): INFO layer 3 lwc lac iter 2, lr 0.00452302 time 3.873277s, mse: 0.01099337
117
+ [2026-01-08 18:44:29 root] (train_utils.py 185): INFO layer 3 lwc lac iter 3, lr 0.00417365 time 3.871180s, mse: 0.01028412
118
+ [2026-01-08 18:44:33 root] (train_utils.py 185): INFO layer 3 lwc lac iter 4, lr 0.00375125 time 3.871590s, mse: 0.01000082
119
+ [2026-01-08 18:44:36 root] (train_utils.py 185): INFO layer 3 lwc lac iter 5, lr 0.00327427 time 3.869860s, mse: 0.00980410
120
+ [2026-01-08 18:44:40 root] (train_utils.py 185): INFO layer 3 lwc lac iter 6, lr 0.00276356 time 3.869903s, mse: 0.00969286
121
+ [2026-01-08 18:44:44 root] (train_utils.py 185): INFO layer 3 lwc lac iter 7, lr 0.00224144 time 3.872837s, mse: 0.00956387
122
+ [2026-01-08 18:44:48 root] (train_utils.py 185): INFO layer 3 lwc lac iter 8, lr 0.00173073 time 3.871547s, mse: 0.00946260
123
+ [2026-01-08 18:44:52 root] (train_utils.py 185): INFO layer 3 lwc lac iter 9, lr 0.00125375 time 3.866354s, mse: 0.00937346
124
+ [2026-01-08 18:44:56 root] (train_utils.py 185): INFO layer 3 lwc lac iter 10, lr 0.00083135 time 3.870575s, mse: 0.00926330
125
+ [2026-01-08 18:45:00 root] (train_utils.py 185): INFO layer 3 lwc lac iter 11, lr 0.00048198 time 3.865113s, mse: 0.00916464
126
+ [2026-01-08 18:45:04 root] (train_utils.py 185): INFO layer 3 lwc lac iter 12, lr 0.00022092 time 3.873380s, mse: 0.00907166
127
+ [2026-01-08 18:45:07 root] (train_utils.py 185): INFO layer 3 lwc lac iter 13, lr 0.00005958 time 3.872867s, mse: 0.00904066
128
+ [2026-01-08 18:45:11 root] (train_utils.py 185): INFO layer 3 lwc lac iter 14, lr 0.00000500 time 3.859612s, mse: 0.00900416
129
+ [2026-01-08 18:45:12 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
130
+ [2026-01-08 18:45:13 root] (train_utils.py 108): INFO ========= Layer 4 =========
131
+ [2026-01-08 18:45:20 root] (train_utils.py 185): INFO layer 4 lwc lac iter 0, lr 0.00494542 time 5.013705s, mse: 0.06576648
132
+ [2026-01-08 18:45:24 root] (train_utils.py 185): INFO layer 4 lwc lac iter 1, lr 0.00478408 time 3.866042s, mse: 0.03741666
133
+ [2026-01-08 18:45:28 root] (train_utils.py 185): INFO layer 4 lwc lac iter 2, lr 0.00452302 time 3.867324s, mse: 0.03053248
134
+ [2026-01-08 18:45:32 root] (train_utils.py 185): INFO layer 4 lwc lac iter 3, lr 0.00417365 time 3.867816s, mse: 0.02855516
135
+ [2026-01-08 18:45:35 root] (train_utils.py 185): INFO layer 4 lwc lac iter 4, lr 0.00375125 time 3.874113s, mse: 0.02790034
136
+ [2026-01-08 18:45:39 root] (train_utils.py 185): INFO layer 4 lwc lac iter 5, lr 0.00327427 time 3.864290s, mse: 0.02746365
137
+ [2026-01-08 18:45:43 root] (train_utils.py 185): INFO layer 4 lwc lac iter 6, lr 0.00276356 time 3.874358s, mse: 0.02716962
138
+ [2026-01-08 18:45:47 root] (train_utils.py 185): INFO layer 4 lwc lac iter 7, lr 0.00224144 time 3.866786s, mse: 0.02687641
139
+ [2026-01-08 18:45:51 root] (train_utils.py 185): INFO layer 4 lwc lac iter 8, lr 0.00173073 time 3.866034s, mse: 0.02662238
140
+ [2026-01-08 18:45:55 root] (train_utils.py 185): INFO layer 4 lwc lac iter 9, lr 0.00125375 time 3.867874s, mse: 0.02643147
141
+ [2026-01-08 18:45:59 root] (train_utils.py 185): INFO layer 4 lwc lac iter 10, lr 0.00083135 time 3.875141s, mse: 0.02624781
142
+ [2026-01-08 18:46:03 root] (train_utils.py 185): INFO layer 4 lwc lac iter 11, lr 0.00048198 time 3.867632s, mse: 0.02604026
143
+ [2026-01-08 18:46:06 root] (train_utils.py 185): INFO layer 4 lwc lac iter 12, lr 0.00022092 time 3.868149s, mse: 0.02585863
144
+ [2026-01-08 18:46:10 root] (train_utils.py 185): INFO layer 4 lwc lac iter 13, lr 0.00005958 time 3.867380s, mse: 0.02578292
145
+ [2026-01-08 18:46:14 root] (train_utils.py 185): INFO layer 4 lwc lac iter 14, lr 0.00000500 time 3.871570s, mse: 0.02572995
146
+ [2026-01-08 18:46:15 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
147
+ [2026-01-08 18:46:15 root] (train_utils.py 108): INFO ========= Layer 5 =========
148
+ [2026-01-08 18:46:23 root] (train_utils.py 185): INFO layer 5 lwc lac iter 0, lr 0.00494542 time 5.259219s, mse: 0.13743916
149
+ [2026-01-08 18:46:27 root] (train_utils.py 185): INFO layer 5 lwc lac iter 1, lr 0.00478408 time 3.869206s, mse: 0.08057592
150
+ [2026-01-08 18:46:31 root] (train_utils.py 185): INFO layer 5 lwc lac iter 2, lr 0.00452302 time 3.877397s, mse: 0.06617787
151
+ [2026-01-08 18:46:35 root] (train_utils.py 185): INFO layer 5 lwc lac iter 3, lr 0.00417365 time 3.875794s, mse: 0.06287611
152
+ [2026-01-08 18:46:38 root] (train_utils.py 185): INFO layer 5 lwc lac iter 4, lr 0.00375125 time 3.874708s, mse: 0.06213523
153
+ [2026-01-08 18:46:42 root] (train_utils.py 185): INFO layer 5 lwc lac iter 5, lr 0.00327427 time 3.873983s, mse: 0.06160403
154
+ [2026-01-08 18:46:46 root] (train_utils.py 185): INFO layer 5 lwc lac iter 6, lr 0.00276356 time 3.868648s, mse: 0.06119698
155
+ [2026-01-08 18:46:50 root] (train_utils.py 185): INFO layer 5 lwc lac iter 7, lr 0.00224144 time 3.871142s, mse: 0.06094177
156
+ [2026-01-08 18:46:54 root] (train_utils.py 185): INFO layer 5 lwc lac iter 8, lr 0.00173073 time 3.869253s, mse: 0.06060794
157
+ [2026-01-08 18:46:58 root] (train_utils.py 185): INFO layer 5 lwc lac iter 9, lr 0.00125375 time 3.880099s, mse: 0.06020888
158
+ [2026-01-08 18:47:02 root] (train_utils.py 185): INFO layer 5 lwc lac iter 10, lr 0.00083135 time 3.872482s, mse: 0.05995716
159
+ [2026-01-08 18:47:06 root] (train_utils.py 185): INFO layer 5 lwc lac iter 11, lr 0.00048198 time 3.875441s, mse: 0.05978661
160
+ [2026-01-08 18:47:09 root] (train_utils.py 185): INFO layer 5 lwc lac iter 12, lr 0.00022092 time 3.871503s, mse: 0.05955682
161
+ [2026-01-08 18:47:13 root] (train_utils.py 185): INFO layer 5 lwc lac iter 13, lr 0.00005958 time 3.870681s, mse: 0.05938030
162
+ [2026-01-08 18:47:17 root] (train_utils.py 185): INFO layer 5 lwc lac iter 14, lr 0.00000500 time 3.873357s, mse: 0.05934311
163
+ [2026-01-08 18:47:18 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
164
+ [2026-01-08 18:47:18 root] (train_utils.py 108): INFO ========= Layer 6 =========
165
+ [2026-01-08 18:47:26 root] (train_utils.py 185): INFO layer 6 lwc lac iter 0, lr 0.00494542 time 5.163689s, mse: 1.86451793
166
+ [2026-01-08 18:47:30 root] (train_utils.py 185): INFO layer 6 lwc lac iter 1, lr 0.00478408 time 3.873172s, mse: 0.35658583
167
+ [2026-01-08 18:47:34 root] (train_utils.py 185): INFO layer 6 lwc lac iter 2, lr 0.00452302 time 3.877536s, mse: 0.32737118
168
+ [2026-01-08 18:47:38 root] (train_utils.py 185): INFO layer 6 lwc lac iter 3, lr 0.00417365 time 3.867511s, mse: 0.28929594
169
+ [2026-01-08 18:47:41 root] (train_utils.py 185): INFO layer 6 lwc lac iter 4, lr 0.00375125 time 3.878428s, mse: 0.24128482
170
+ [2026-01-08 18:47:45 root] (train_utils.py 185): INFO layer 6 lwc lac iter 5, lr 0.00327427 time 3.864721s, mse: 0.21027605
171
+ [2026-01-08 18:47:49 root] (train_utils.py 185): INFO layer 6 lwc lac iter 6, lr 0.00276356 time 3.872345s, mse: 0.25483868
172
+ [2026-01-08 18:47:53 root] (train_utils.py 185): INFO layer 6 lwc lac iter 7, lr 0.00224144 time 3.869268s, mse: 0.23871142
173
+ [2026-01-08 18:47:57 root] (train_utils.py 185): INFO layer 6 lwc lac iter 8, lr 0.00173073 time 3.874523s, mse: 0.21885920
174
+ [2026-01-08 18:48:01 root] (train_utils.py 185): INFO layer 6 lwc lac iter 9, lr 0.00125375 time 3.866698s, mse: 0.20672695
175
+ [2026-01-08 18:48:05 root] (train_utils.py 185): INFO layer 6 lwc lac iter 10, lr 0.00083135 time 3.875635s, mse: 0.20202750
176
+ [2026-01-08 18:48:09 root] (train_utils.py 185): INFO layer 6 lwc lac iter 11, lr 0.00048198 time 3.868720s, mse: 0.17932597
177
+ [2026-01-08 18:48:12 root] (train_utils.py 185): INFO layer 6 lwc lac iter 12, lr 0.00022092 time 3.877253s, mse: 0.20257902
178
+ [2026-01-08 18:48:16 root] (train_utils.py 185): INFO layer 6 lwc lac iter 13, lr 0.00005958 time 3.873001s, mse: 0.20667967
179
+ [2026-01-08 18:48:20 root] (train_utils.py 185): INFO layer 6 lwc lac iter 14, lr 0.00000500 time 3.868689s, mse: 0.16777667
180
+ [2026-01-08 18:48:21 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
181
+ [2026-01-08 18:48:21 root] (train_utils.py 108): INFO ========= Layer 7 =========
182
+ [2026-01-08 18:48:29 root] (train_utils.py 185): INFO layer 7 lwc lac iter 0, lr 0.00494542 time 5.284689s, mse: 0.23462753
183
+ [2026-01-08 18:48:33 root] (train_utils.py 185): INFO layer 7 lwc lac iter 1, lr 0.00478408 time 3.870807s, mse: 0.14976017
184
+ [2026-01-08 18:48:37 root] (train_utils.py 185): INFO layer 7 lwc lac iter 2, lr 0.00452302 time 3.875834s, mse: 0.12312289
185
+ [2026-01-08 18:48:41 root] (train_utils.py 185): INFO layer 7 lwc lac iter 3, lr 0.00417365 time 3.870820s, mse: 0.11779824
186
+ [2026-01-08 18:48:44 root] (train_utils.py 185): INFO layer 7 lwc lac iter 4, lr 0.00375125 time 3.873519s, mse: 0.11621600
187
+ [2026-01-08 18:48:48 root] (train_utils.py 185): INFO layer 7 lwc lac iter 5, lr 0.00327427 time 3.879462s, mse: 0.11538153
188
+ [2026-01-08 18:48:52 root] (train_utils.py 185): INFO layer 7 lwc lac iter 6, lr 0.00276356 time 3.869727s, mse: 0.11461711
189
+ [2026-01-08 18:48:56 root] (train_utils.py 185): INFO layer 7 lwc lac iter 7, lr 0.00224144 time 3.870542s, mse: 0.11396322
190
+ [2026-01-08 18:49:00 root] (train_utils.py 185): INFO layer 7 lwc lac iter 8, lr 0.00173073 time 3.872142s, mse: 0.11346199
191
+ [2026-01-08 18:49:04 root] (train_utils.py 185): INFO layer 7 lwc lac iter 9, lr 0.00125375 time 3.875134s, mse: 0.11303829
192
+ [2026-01-08 18:49:08 root] (train_utils.py 185): INFO layer 7 lwc lac iter 10, lr 0.00083135 time 3.868972s, mse: 0.11244514
193
+ [2026-01-08 18:49:12 root] (train_utils.py 185): INFO layer 7 lwc lac iter 11, lr 0.00048198 time 3.871574s, mse: 0.11193727
194
+ [2026-01-08 18:49:15 root] (train_utils.py 185): INFO layer 7 lwc lac iter 12, lr 0.00022092 time 3.900703s, mse: 0.11167257
195
+ [2026-01-08 18:49:19 root] (train_utils.py 185): INFO layer 7 lwc lac iter 13, lr 0.00005958 time 3.875596s, mse: 0.11139309
196
+ [2026-01-08 18:49:23 root] (train_utils.py 185): INFO layer 7 lwc lac iter 14, lr 0.00000500 time 3.875573s, mse: 0.11127126
197
+ [2026-01-08 18:49:24 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
198
+ [2026-01-08 18:49:24 root] (train_utils.py 108): INFO ========= Layer 8 =========
199
+ [2026-01-08 18:49:32 root] (train_utils.py 185): INFO layer 8 lwc lac iter 0, lr 0.00494542 time 5.331074s, mse: 0.31783378
200
+ [2026-01-08 18:49:36 root] (train_utils.py 185): INFO layer 8 lwc lac iter 1, lr 0.00478408 time 4.026908s, mse: 0.21154313
201
+ [2026-01-08 18:49:40 root] (train_utils.py 185): INFO layer 8 lwc lac iter 2, lr 0.00452302 time 3.870493s, mse: 0.17556834
202
+ [2026-01-08 18:49:44 root] (train_utils.py 185): INFO layer 8 lwc lac iter 3, lr 0.00417365 time 3.876034s, mse: 0.16892871
203
+ [2026-01-08 18:49:48 root] (train_utils.py 185): INFO layer 8 lwc lac iter 4, lr 0.00375125 time 3.871941s, mse: 0.16700211
204
+ [2026-01-08 18:49:51 root] (train_utils.py 185): INFO layer 8 lwc lac iter 5, lr 0.00327427 time 3.867337s, mse: 0.16594610
205
+ [2026-01-08 18:49:55 root] (train_utils.py 185): INFO layer 8 lwc lac iter 6, lr 0.00276356 time 3.865289s, mse: 0.16510613
206
+ [2026-01-08 18:49:59 root] (train_utils.py 185): INFO layer 8 lwc lac iter 7, lr 0.00224144 time 3.873836s, mse: 0.16456470
207
+ [2026-01-08 18:50:03 root] (train_utils.py 185): INFO layer 8 lwc lac iter 8, lr 0.00173073 time 3.868187s, mse: 0.16401851
208
+ [2026-01-08 18:50:07 root] (train_utils.py 185): INFO layer 8 lwc lac iter 9, lr 0.00125375 time 3.874494s, mse: 0.16352586
209
+ [2026-01-08 18:50:11 root] (train_utils.py 185): INFO layer 8 lwc lac iter 10, lr 0.00083135 time 3.868160s, mse: 0.16331530
210
+ [2026-01-08 18:50:15 root] (train_utils.py 185): INFO layer 8 lwc lac iter 11, lr 0.00048198 time 3.871973s, mse: 0.16285881
211
+ [2026-01-08 18:50:19 root] (train_utils.py 185): INFO layer 8 lwc lac iter 12, lr 0.00022092 time 3.868864s, mse: 0.16254890
212
+ [2026-01-08 18:50:22 root] (train_utils.py 185): INFO layer 8 lwc lac iter 13, lr 0.00005958 time 3.867946s, mse: 0.16240378
213
+ [2026-01-08 18:50:26 root] (train_utils.py 185): INFO layer 8 lwc lac iter 14, lr 0.00000500 time 3.872199s, mse: 0.16246043
214
+ [2026-01-08 18:50:27 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
215
+ [2026-01-08 18:50:28 root] (train_utils.py 108): INFO ========= Layer 9 =========
216
+ [2026-01-08 18:50:35 root] (train_utils.py 185): INFO layer 9 lwc lac iter 0, lr 0.00494542 time 5.142299s, mse: 0.37875688
217
+ [2026-01-08 18:50:39 root] (train_utils.py 185): INFO layer 9 lwc lac iter 1, lr 0.00478408 time 3.869491s, mse: 0.25363240
218
+ [2026-01-08 18:50:43 root] (train_utils.py 185): INFO layer 9 lwc lac iter 2, lr 0.00452302 time 3.869034s, mse: 0.21064380
219
+ [2026-01-08 18:50:47 root] (train_utils.py 185): INFO layer 9 lwc lac iter 3, lr 0.00417365 time 3.872067s, mse: 0.20179385
220
+ [2026-01-08 18:50:51 root] (train_utils.py 185): INFO layer 9 lwc lac iter 4, lr 0.00375125 time 3.880042s, mse: 0.19936548
221
+ [2026-01-08 18:50:55 root] (train_utils.py 185): INFO layer 9 lwc lac iter 5, lr 0.00327427 time 3.866734s, mse: 0.19817175
222
+ [2026-01-08 18:50:59 root] (train_utils.py 185): INFO layer 9 lwc lac iter 6, lr 0.00276356 time 3.873047s, mse: 0.19703594
223
+ [2026-01-08 18:51:02 root] (train_utils.py 185): INFO layer 9 lwc lac iter 7, lr 0.00224144 time 3.874506s, mse: 0.19626960
224
+ [2026-01-08 18:51:06 root] (train_utils.py 185): INFO layer 9 lwc lac iter 8, lr 0.00173073 time 3.874818s, mse: 0.19534998
225
+ [2026-01-08 18:51:10 root] (train_utils.py 185): INFO layer 9 lwc lac iter 9, lr 0.00125375 time 3.871551s, mse: 0.19473058
226
+ [2026-01-08 18:51:14 root] (train_utils.py 185): INFO layer 9 lwc lac iter 10, lr 0.00083135 time 3.871144s, mse: 0.19404019
227
+ [2026-01-08 18:51:18 root] (train_utils.py 185): INFO layer 9 lwc lac iter 11, lr 0.00048198 time 3.871020s, mse: 0.19356999
228
+ [2026-01-08 18:51:22 root] (train_utils.py 185): INFO layer 9 lwc lac iter 12, lr 0.00022092 time 3.869848s, mse: 0.19326007
229
+ [2026-01-08 18:51:26 root] (train_utils.py 185): INFO layer 9 lwc lac iter 13, lr 0.00005958 time 3.865149s, mse: 0.19282311
230
+ [2026-01-08 18:51:30 root] (train_utils.py 185): INFO layer 9 lwc lac iter 14, lr 0.00000500 time 3.872090s, mse: 0.19267595
231
+ [2026-01-08 18:51:30 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
232
+ [2026-01-08 18:51:31 root] (train_utils.py 108): INFO ========= Layer 10 =========
233
+ [2026-01-08 18:51:38 root] (train_utils.py 185): INFO layer 10 lwc lac iter 0, lr 0.00494542 time 5.212747s, mse: 0.44592521
234
+ [2026-01-08 18:51:42 root] (train_utils.py 185): INFO layer 10 lwc lac iter 1, lr 0.00478408 time 3.870292s, mse: 0.28058022
235
+ [2026-01-08 18:51:46 root] (train_utils.py 185): INFO layer 10 lwc lac iter 2, lr 0.00452302 time 3.876544s, mse: 0.22870731
236
+ [2026-01-08 18:51:50 root] (train_utils.py 185): INFO layer 10 lwc lac iter 3, lr 0.00417365 time 3.870140s, mse: 0.21672769
237
+ [2026-01-08 18:51:54 root] (train_utils.py 185): INFO layer 10 lwc lac iter 4, lr 0.00375125 time 3.867480s, mse: 0.21354958
238
+ [2026-01-08 18:51:58 root] (train_utils.py 185): INFO layer 10 lwc lac iter 5, lr 0.00327427 time 3.874588s, mse: 0.21149486
239
+ [2026-01-08 18:52:02 root] (train_utils.py 185): INFO layer 10 lwc lac iter 6, lr 0.00276356 time 3.870549s, mse: 0.21045262
240
+ [2026-01-08 18:52:05 root] (train_utils.py 185): INFO layer 10 lwc lac iter 7, lr 0.00224144 time 3.872663s, mse: 0.20926467
241
+ [2026-01-08 18:52:09 root] (train_utils.py 185): INFO layer 10 lwc lac iter 8, lr 0.00173073 time 3.873691s, mse: 0.20823501
242
+ [2026-01-08 18:52:13 root] (train_utils.py 185): INFO layer 10 lwc lac iter 9, lr 0.00125375 time 3.869580s, mse: 0.20746952
243
+ [2026-01-08 18:52:17 root] (train_utils.py 185): INFO layer 10 lwc lac iter 10, lr 0.00083135 time 3.872574s, mse: 0.20690618
244
+ [2026-01-08 18:52:21 root] (train_utils.py 185): INFO layer 10 lwc lac iter 11, lr 0.00048198 time 3.870390s, mse: 0.20613439
245
+ [2026-01-08 18:52:25 root] (train_utils.py 185): INFO layer 10 lwc lac iter 12, lr 0.00022092 time 3.867468s, mse: 0.20562243
246
+ [2026-01-08 18:52:29 root] (train_utils.py 185): INFO layer 10 lwc lac iter 13, lr 0.00005958 time 3.874285s, mse: 0.20517452
247
+ [2026-01-08 18:52:33 root] (train_utils.py 185): INFO layer 10 lwc lac iter 14, lr 0.00000500 time 3.867692s, mse: 0.20504668
248
+ [2026-01-08 18:52:33 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
249
+ [2026-01-08 18:52:34 root] (train_utils.py 108): INFO ========= Layer 11 =========
250
+ [2026-01-08 18:52:42 root] (train_utils.py 185): INFO layer 11 lwc lac iter 0, lr 0.00494542 time 5.363442s, mse: 0.39262417
251
+ [2026-01-08 18:52:46 root] (train_utils.py 185): INFO layer 11 lwc lac iter 1, lr 0.00478408 time 3.927743s, mse: 0.27127978
252
+ [2026-01-08 18:52:49 root] (train_utils.py 185): INFO layer 11 lwc lac iter 2, lr 0.00452302 time 3.871058s, mse: 0.22630122
253
+ [2026-01-08 18:52:53 root] (train_utils.py 185): INFO layer 11 lwc lac iter 3, lr 0.00417365 time 3.867345s, mse: 0.21789221
254
+ [2026-01-08 18:52:57 root] (train_utils.py 185): INFO layer 11 lwc lac iter 4, lr 0.00375125 time 3.870425s, mse: 0.21573043
255
+ [2026-01-08 18:53:01 root] (train_utils.py 185): INFO layer 11 lwc lac iter 5, lr 0.00327427 time 3.883979s, mse: 0.21401882
256
+ [2026-01-08 18:53:05 root] (train_utils.py 185): INFO layer 11 lwc lac iter 6, lr 0.00276356 time 3.875270s, mse: 0.21313243
257
+ [2026-01-08 18:53:09 root] (train_utils.py 185): INFO layer 11 lwc lac iter 7, lr 0.00224144 time 3.878757s, mse: 0.21215978
258
+ [2026-01-08 18:53:13 root] (train_utils.py 185): INFO layer 11 lwc lac iter 8, lr 0.00173073 time 3.868496s, mse: 0.21121168
259
+ [2026-01-08 18:53:17 root] (train_utils.py 185): INFO layer 11 lwc lac iter 9, lr 0.00125375 time 3.869603s, mse: 0.21032479
260
+ [2026-01-08 18:53:20 root] (train_utils.py 185): INFO layer 11 lwc lac iter 10, lr 0.00083135 time 3.874183s, mse: 0.20987187
261
+ [2026-01-08 18:53:24 root] (train_utils.py 185): INFO layer 11 lwc lac iter 11, lr 0.00048198 time 3.864224s, mse: 0.20908046
262
+ [2026-01-08 18:53:28 root] (train_utils.py 185): INFO layer 11 lwc lac iter 12, lr 0.00022092 time 3.869574s, mse: 0.20848191
263
+ [2026-01-08 18:53:32 root] (train_utils.py 185): INFO layer 11 lwc lac iter 13, lr 0.00005958 time 3.873634s, mse: 0.20800886
264
+ [2026-01-08 18:53:36 root] (train_utils.py 185): INFO layer 11 lwc lac iter 14, lr 0.00000500 time 3.872741s, mse: 0.20795538
265
+ [2026-01-08 18:53:36 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
266
+ [2026-01-08 18:53:37 root] (train_utils.py 108): INFO ========= Layer 12 =========
267
+ [2026-01-08 18:53:45 root] (train_utils.py 185): INFO layer 12 lwc lac iter 0, lr 0.00494542 time 5.307302s, mse: 0.43535280
268
+ [2026-01-08 18:53:49 root] (train_utils.py 185): INFO layer 12 lwc lac iter 1, lr 0.00478408 time 3.867751s, mse: 0.29579335
269
+ [2026-01-08 18:53:53 root] (train_utils.py 185): INFO layer 12 lwc lac iter 2, lr 0.00452302 time 3.869383s, mse: 0.24488190
270
+ [2026-01-08 18:53:57 root] (train_utils.py 185): INFO layer 12 lwc lac iter 3, lr 0.00417365 time 3.869865s, mse: 0.23438135
271
+ [2026-01-08 18:54:01 root] (train_utils.py 185): INFO layer 12 lwc lac iter 4, lr 0.00375125 time 3.875870s, mse: 0.23133603
272
+ [2026-01-08 18:54:04 root] (train_utils.py 185): INFO layer 12 lwc lac iter 5, lr 0.00327427 time 3.872000s, mse: 0.22933656
273
+ [2026-01-08 18:54:08 root] (train_utils.py 185): INFO layer 12 lwc lac iter 6, lr 0.00276356 time 3.880461s, mse: 0.22804067
274
+ [2026-01-08 18:54:12 root] (train_utils.py 185): INFO layer 12 lwc lac iter 7, lr 0.00224144 time 3.874234s, mse: 0.22690852
275
+ [2026-01-08 18:54:16 root] (train_utils.py 185): INFO layer 12 lwc lac iter 8, lr 0.00173073 time 3.875051s, mse: 0.22579126
276
+ [2026-01-08 18:54:20 root] (train_utils.py 185): INFO layer 12 lwc lac iter 9, lr 0.00125375 time 3.877198s, mse: 0.22475064
277
+ [2026-01-08 18:54:24 root] (train_utils.py 185): INFO layer 12 lwc lac iter 10, lr 0.00083135 time 3.872357s, mse: 0.22366890
278
+ [2026-01-08 18:54:28 root] (train_utils.py 185): INFO layer 12 lwc lac iter 11, lr 0.00048198 time 3.870901s, mse: 0.22277188
279
+ [2026-01-08 18:54:32 root] (train_utils.py 185): INFO layer 12 lwc lac iter 12, lr 0.00022092 time 3.872854s, mse: 0.22196589
280
+ [2026-01-08 18:54:35 root] (train_utils.py 185): INFO layer 12 lwc lac iter 13, lr 0.00005958 time 3.894168s, mse: 0.22144113
281
+ [2026-01-08 18:54:39 root] (train_utils.py 185): INFO layer 12 lwc lac iter 14, lr 0.00000500 time 3.869174s, mse: 0.22116731
282
+ [2026-01-08 18:54:40 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
283
+ [2026-01-08 18:54:41 root] (train_utils.py 108): INFO ========= Layer 13 =========
284
+ [2026-01-08 18:54:48 root] (train_utils.py 185): INFO layer 13 lwc lac iter 0, lr 0.00494542 time 5.302742s, mse: 0.44991863
285
+ [2026-01-08 18:54:52 root] (train_utils.py 185): INFO layer 13 lwc lac iter 1, lr 0.00478408 time 3.929768s, mse: 0.30773303
286
+ [2026-01-08 18:54:56 root] (train_utils.py 185): INFO layer 13 lwc lac iter 2, lr 0.00452302 time 3.868324s, mse: 0.25602528
287
+ [2026-01-08 18:55:00 root] (train_utils.py 185): INFO layer 13 lwc lac iter 3, lr 0.00417365 time 3.871515s, mse: 0.24593170
288
+ [2026-01-08 18:55:04 root] (train_utils.py 185): INFO layer 13 lwc lac iter 4, lr 0.00375125 time 3.868756s, mse: 0.24332635
289
+ [2026-01-08 18:55:08 root] (train_utils.py 185): INFO layer 13 lwc lac iter 5, lr 0.00327427 time 3.875814s, mse: 0.24169515
290
+ [2026-01-08 18:55:12 root] (train_utils.py 185): INFO layer 13 lwc lac iter 6, lr 0.00276356 time 3.877859s, mse: 0.24032030
291
+ [2026-01-08 18:55:16 root] (train_utils.py 185): INFO layer 13 lwc lac iter 7, lr 0.00224144 time 3.871221s, mse: 0.23895445
292
+ [2026-01-08 18:55:19 root] (train_utils.py 185): INFO layer 13 lwc lac iter 8, lr 0.00173073 time 3.870597s, mse: 0.23795472
293
+ [2026-01-08 18:55:23 root] (train_utils.py 185): INFO layer 13 lwc lac iter 9, lr 0.00125375 time 3.872128s, mse: 0.23691620
294
+ [2026-01-08 18:55:27 root] (train_utils.py 185): INFO layer 13 lwc lac iter 10, lr 0.00083135 time 3.870418s, mse: 0.23617835
295
+ [2026-01-08 18:55:31 root] (train_utils.py 185): INFO layer 13 lwc lac iter 11, lr 0.00048198 time 3.875942s, mse: 0.23538260
296
+ [2026-01-08 18:55:35 root] (train_utils.py 185): INFO layer 13 lwc lac iter 12, lr 0.00022092 time 3.872318s, mse: 0.23459788
297
+ [2026-01-08 18:55:39 root] (train_utils.py 185): INFO layer 13 lwc lac iter 13, lr 0.00005958 time 3.874342s, mse: 0.23386008
298
+ [2026-01-08 18:55:43 root] (train_utils.py 185): INFO layer 13 lwc lac iter 14, lr 0.00000500 time 3.876184s, mse: 0.23347831
299
+ [2026-01-08 18:55:43 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
300
+ [2026-01-08 18:55:44 root] (train_utils.py 108): INFO ========= Layer 14 =========
301
+ [2026-01-08 18:55:52 root] (train_utils.py 185): INFO layer 14 lwc lac iter 0, lr 0.00494542 time 5.314241s, mse: 0.48670265
302
+ [2026-01-08 18:55:56 root] (train_utils.py 185): INFO layer 14 lwc lac iter 1, lr 0.00478408 time 3.883879s, mse: 0.32924685
303
+ [2026-01-08 18:55:59 root] (train_utils.py 185): INFO layer 14 lwc lac iter 2, lr 0.00452302 time 3.871489s, mse: 0.27174610
304
+ [2026-01-08 18:56:03 root] (train_utils.py 185): INFO layer 14 lwc lac iter 3, lr 0.00417365 time 3.877281s, mse: 0.26111004
305
+ [2026-01-08 18:56:07 root] (train_utils.py 185): INFO layer 14 lwc lac iter 4, lr 0.00375125 time 3.878602s, mse: 0.25857583
306
+ [2026-01-08 18:56:11 root] (train_utils.py 185): INFO layer 14 lwc lac iter 5, lr 0.00327427 time 3.884447s, mse: 0.25724220
307
+ [2026-01-08 18:56:15 root] (train_utils.py 185): INFO layer 14 lwc lac iter 6, lr 0.00276356 time 3.877262s, mse: 0.25530052
308
+ [2026-01-08 18:56:19 root] (train_utils.py 185): INFO layer 14 lwc lac iter 7, lr 0.00224144 time 3.876701s, mse: 0.25373703
309
+ [2026-01-08 18:56:23 root] (train_utils.py 185): INFO layer 14 lwc lac iter 8, lr 0.00173073 time 3.869868s, mse: 0.25232333
310
+ [2026-01-08 18:56:27 root] (train_utils.py 185): INFO layer 14 lwc lac iter 9, lr 0.00125375 time 3.867889s, mse: 0.25103748
311
+ [2026-01-08 18:56:30 root] (train_utils.py 185): INFO layer 14 lwc lac iter 10, lr 0.00083135 time 3.876401s, mse: 0.24987648
312
+ [2026-01-08 18:56:34 root] (train_utils.py 185): INFO layer 14 lwc lac iter 11, lr 0.00048198 time 3.875775s, mse: 0.24912813
313
+ [2026-01-08 18:56:38 root] (train_utils.py 185): INFO layer 14 lwc lac iter 12, lr 0.00022092 time 3.870832s, mse: 0.24813016
314
+ [2026-01-08 18:56:42 root] (train_utils.py 185): INFO layer 14 lwc lac iter 13, lr 0.00005958 time 3.870428s, mse: 0.24762598
315
+ [2026-01-08 18:56:46 root] (train_utils.py 185): INFO layer 14 lwc lac iter 14, lr 0.00000500 time 3.866288s, mse: 0.24739194
316
+ [2026-01-08 18:56:46 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
317
+ [2026-01-08 18:56:47 root] (train_utils.py 108): INFO ========= Layer 15 =========
318
+ [2026-01-08 18:56:56 root] (train_utils.py 185): INFO layer 15 lwc lac iter 0, lr 0.00494542 time 5.946069s, mse: 0.48941827
319
+ [2026-01-08 18:57:00 root] (train_utils.py 185): INFO layer 15 lwc lac iter 1, lr 0.00478408 time 3.964455s, mse: 0.32720220
320
+ [2026-01-08 18:57:03 root] (train_utils.py 185): INFO layer 15 lwc lac iter 2, lr 0.00452302 time 3.909539s, mse: 0.26854873
321
+ [2026-01-08 18:57:07 root] (train_utils.py 185): INFO layer 15 lwc lac iter 3, lr 0.00417365 time 3.873296s, mse: 0.25705975
322
+ [2026-01-08 18:57:11 root] (train_utils.py 185): INFO layer 15 lwc lac iter 4, lr 0.00375125 time 3.876359s, mse: 0.25422159
323
+ [2026-01-08 18:57:15 root] (train_utils.py 185): INFO layer 15 lwc lac iter 5, lr 0.00327427 time 3.876583s, mse: 0.25197345
324
+ [2026-01-08 18:57:19 root] (train_utils.py 185): INFO layer 15 lwc lac iter 6, lr 0.00276356 time 3.870843s, mse: 0.25026903
325
+ [2026-01-08 18:57:23 root] (train_utils.py 185): INFO layer 15 lwc lac iter 7, lr 0.00224144 time 3.871294s, mse: 0.24867499
326
+ [2026-01-08 18:57:27 root] (train_utils.py 185): INFO layer 15 lwc lac iter 8, lr 0.00173073 time 3.868954s, mse: 0.24771519
327
+ [2026-01-08 18:57:31 root] (train_utils.py 185): INFO layer 15 lwc lac iter 9, lr 0.00125375 time 3.875140s, mse: 0.24665023
328
+ [2026-01-08 18:57:34 root] (train_utils.py 185): INFO layer 15 lwc lac iter 10, lr 0.00083135 time 3.873682s, mse: 0.24558856
329
+ [2026-01-08 18:57:38 root] (train_utils.py 185): INFO layer 15 lwc lac iter 11, lr 0.00048198 time 3.872494s, mse: 0.24435455
330
+ [2026-01-08 18:57:42 root] (train_utils.py 185): INFO layer 15 lwc lac iter 12, lr 0.00022092 time 3.873586s, mse: 0.24346027
331
+ [2026-01-08 18:57:46 root] (train_utils.py 185): INFO layer 15 lwc lac iter 13, lr 0.00005958 time 3.890473s, mse: 0.24292424
332
+ [2026-01-08 18:57:50 root] (train_utils.py 185): INFO layer 15 lwc lac iter 14, lr 0.00000500 time 3.872211s, mse: 0.24260354
333
+ [2026-01-08 18:57:50 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
334
+ [2026-01-08 18:57:51 root] (train_utils.py 108): INFO ========= Layer 16 =========
335
+ [2026-01-08 18:57:58 root] (train_utils.py 185): INFO layer 16 lwc lac iter 0, lr 0.00494542 time 4.626063s, mse: 3.09758520
336
+ [2026-01-08 18:58:02 root] (train_utils.py 185): INFO layer 16 lwc lac iter 1, lr 0.00478408 time 3.892791s, mse: 1.53681600
337
+ [2026-01-08 18:58:06 root] (train_utils.py 185): INFO layer 16 lwc lac iter 2, lr 0.00452302 time 3.875230s, mse: 1.37538433
338
+ [2026-01-08 18:58:10 root] (train_utils.py 185): INFO layer 16 lwc lac iter 3, lr 0.00417365 time 3.887590s, mse: 1.14041376
339
+ [2026-01-08 18:58:14 root] (train_utils.py 185): INFO layer 16 lwc lac iter 4, lr 0.00375125 time 3.873685s, mse: 1.13041377
340
+ [2026-01-08 18:58:18 root] (train_utils.py 185): INFO layer 16 lwc lac iter 5, lr 0.00327427 time 3.886949s, mse: 1.17505825
341
+ [2026-01-08 18:58:21 root] (train_utils.py 185): INFO layer 16 lwc lac iter 6, lr 0.00276356 time 3.875163s, mse: 1.00187659
342
+ [2026-01-08 18:58:25 root] (train_utils.py 185): INFO layer 16 lwc lac iter 7, lr 0.00224144 time 3.873014s, mse: 1.15916288
343
+ [2026-01-08 18:58:29 root] (train_utils.py 185): INFO layer 16 lwc lac iter 8, lr 0.00173073 time 3.870400s, mse: 0.93556213
344
+ [2026-01-08 18:58:33 root] (train_utils.py 185): INFO layer 16 lwc lac iter 9, lr 0.00125375 time 3.883701s, mse: 0.89307052
345
+ [2026-01-08 18:58:37 root] (train_utils.py 185): INFO layer 16 lwc lac iter 10, lr 0.00083135 time 3.874257s, mse: 1.08854449
346
+ [2026-01-08 18:58:41 root] (train_utils.py 185): INFO layer 16 lwc lac iter 11, lr 0.00048198 time 3.872130s, mse: 0.78587675
347
+ [2026-01-08 18:58:45 root] (train_utils.py 185): INFO layer 16 lwc lac iter 12, lr 0.00022092 time 3.872023s, mse: 0.77024889
348
+ [2026-01-08 18:58:49 root] (train_utils.py 185): INFO layer 16 lwc lac iter 13, lr 0.00005958 time 3.877707s, mse: 0.74143833
349
+ [2026-01-08 18:58:52 root] (train_utils.py 185): INFO layer 16 lwc lac iter 14, lr 0.00000500 time 3.870870s, mse: 0.62904388
350
+ [2026-01-08 18:58:53 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
351
+ [2026-01-08 18:58:54 root] (train_utils.py 108): INFO ========= Layer 17 =========
352
+ [2026-01-08 18:59:02 root] (train_utils.py 185): INFO layer 17 lwc lac iter 0, lr 0.00494542 time 5.444665s, mse: 0.57632238
353
+ [2026-01-08 18:59:06 root] (train_utils.py 185): INFO layer 17 lwc lac iter 1, lr 0.00478408 time 3.880440s, mse: 0.38568184
354
+ [2026-01-08 18:59:09 root] (train_utils.py 185): INFO layer 17 lwc lac iter 2, lr 0.00452302 time 3.868829s, mse: 0.30990756
355
+ [2026-01-08 18:59:13 root] (train_utils.py 185): INFO layer 17 lwc lac iter 3, lr 0.00417365 time 3.874089s, mse: 0.29348093
356
+ [2026-01-08 18:59:17 root] (train_utils.py 185): INFO layer 17 lwc lac iter 4, lr 0.00375125 time 3.880884s, mse: 0.28841209
357
+ [2026-01-08 18:59:21 root] (train_utils.py 185): INFO layer 17 lwc lac iter 5, lr 0.00327427 time 3.874481s, mse: 0.28536177
358
+ [2026-01-08 18:59:25 root] (train_utils.py 185): INFO layer 17 lwc lac iter 6, lr 0.00276356 time 3.876625s, mse: 0.28336507
359
+ [2026-01-08 18:59:29 root] (train_utils.py 185): INFO layer 17 lwc lac iter 7, lr 0.00224144 time 3.883069s, mse: 0.28023016
360
+ [2026-01-08 18:59:33 root] (train_utils.py 185): INFO layer 17 lwc lac iter 8, lr 0.00173073 time 3.871980s, mse: 0.27797151
361
+ [2026-01-08 18:59:37 root] (train_utils.py 185): INFO layer 17 lwc lac iter 9, lr 0.00125375 time 3.875011s, mse: 0.27724716
362
+ [2026-01-08 18:59:40 root] (train_utils.py 185): INFO layer 17 lwc lac iter 10, lr 0.00083135 time 3.871064s, mse: 0.27549568
363
+ [2026-01-08 18:59:44 root] (train_utils.py 185): INFO layer 17 lwc lac iter 11, lr 0.00048198 time 3.874846s, mse: 0.27411795
364
+ [2026-01-08 18:59:48 root] (train_utils.py 185): INFO layer 17 lwc lac iter 12, lr 0.00022092 time 3.875467s, mse: 0.27230272
365
+ [2026-01-08 18:59:52 root] (train_utils.py 185): INFO layer 17 lwc lac iter 13, lr 0.00005958 time 3.871872s, mse: 0.27161792
366
+ [2026-01-08 18:59:56 root] (train_utils.py 185): INFO layer 17 lwc lac iter 14, lr 0.00000500 time 3.877031s, mse: 0.27142629
367
+ [2026-01-08 18:59:56 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
368
+ [2026-01-08 18:59:57 root] (train_utils.py 108): INFO ========= Layer 18 =========
369
+ [2026-01-08 19:00:05 root] (train_utils.py 185): INFO layer 18 lwc lac iter 0, lr 0.00494542 time 5.037441s, mse: 0.68219566
370
+ [2026-01-08 19:00:09 root] (train_utils.py 185): INFO layer 18 lwc lac iter 1, lr 0.00478408 time 3.873358s, mse: 0.44933167
371
+ [2026-01-08 19:00:13 root] (train_utils.py 185): INFO layer 18 lwc lac iter 2, lr 0.00452302 time 3.869275s, mse: 0.36149144
372
+ [2026-01-08 19:00:17 root] (train_utils.py 185): INFO layer 18 lwc lac iter 3, lr 0.00417365 time 3.869983s, mse: 0.34437451
373
+ [2026-01-08 19:00:21 root] (train_utils.py 185): INFO layer 18 lwc lac iter 4, lr 0.00375125 time 3.867804s, mse: 0.33928376
374
+ [2026-01-08 19:00:24 root] (train_utils.py 185): INFO layer 18 lwc lac iter 5, lr 0.00327427 time 3.872184s, mse: 0.33628541
375
+ [2026-01-08 19:00:28 root] (train_utils.py 185): INFO layer 18 lwc lac iter 6, lr 0.00276356 time 3.868960s, mse: 0.33380261
376
+ [2026-01-08 19:00:32 root] (train_utils.py 185): INFO layer 18 lwc lac iter 7, lr 0.00224144 time 3.872158s, mse: 0.33132178
377
+ [2026-01-08 19:00:36 root] (train_utils.py 185): INFO layer 18 lwc lac iter 8, lr 0.00173073 time 3.871630s, mse: 0.32943395
378
+ [2026-01-08 19:00:40 root] (train_utils.py 185): INFO layer 18 lwc lac iter 9, lr 0.00125375 time 3.873256s, mse: 0.32786560
379
+ [2026-01-08 19:00:44 root] (train_utils.py 185): INFO layer 18 lwc lac iter 10, lr 0.00083135 time 3.874281s, mse: 0.32583937
380
+ [2026-01-08 19:00:48 root] (train_utils.py 185): INFO layer 18 lwc lac iter 11, lr 0.00048198 time 3.869690s, mse: 0.32450172
381
+ [2026-01-08 19:00:52 root] (train_utils.py 185): INFO layer 18 lwc lac iter 12, lr 0.00022092 time 3.871679s, mse: 0.32264820
382
+ [2026-01-08 19:00:55 root] (train_utils.py 185): INFO layer 18 lwc lac iter 13, lr 0.00005958 time 3.878042s, mse: 0.32187557
383
+ [2026-01-08 19:00:59 root] (train_utils.py 185): INFO layer 18 lwc lac iter 14, lr 0.00000500 time 3.873233s, mse: 0.32105669
384
+ [2026-01-08 19:01:00 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
385
+ [2026-01-08 19:01:01 root] (train_utils.py 108): INFO ========= Layer 19 =========
386
+ [2026-01-08 19:01:10 root] (train_utils.py 185): INFO layer 19 lwc lac iter 0, lr 0.00494542 time 5.786413s, mse: 0.88728219
387
+ [2026-01-08 19:01:14 root] (train_utils.py 185): INFO layer 19 lwc lac iter 1, lr 0.00478408 time 4.102961s, mse: 0.57078516
388
+ [2026-01-08 19:01:18 root] (train_utils.py 185): INFO layer 19 lwc lac iter 2, lr 0.00452302 time 3.876380s, mse: 0.45792666
389
+ [2026-01-08 19:01:22 root] (train_utils.py 185): INFO layer 19 lwc lac iter 3, lr 0.00417365 time 3.871944s, mse: 0.43537480
390
+ [2026-01-08 19:01:25 root] (train_utils.py 185): INFO layer 19 lwc lac iter 4, lr 0.00375125 time 3.870567s, mse: 0.42894897
391
+ [2026-01-08 19:01:29 root] (train_utils.py 185): INFO layer 19 lwc lac iter 5, lr 0.00327427 time 3.872926s, mse: 0.42462113
392
+ [2026-01-08 19:01:33 root] (train_utils.py 185): INFO layer 19 lwc lac iter 6, lr 0.00276356 time 3.871444s, mse: 0.42157629
393
+ [2026-01-08 19:01:37 root] (train_utils.py 185): INFO layer 19 lwc lac iter 7, lr 0.00224144 time 3.871675s, mse: 0.41864219
394
+ [2026-01-08 19:01:41 root] (train_utils.py 185): INFO layer 19 lwc lac iter 8, lr 0.00173073 time 3.871620s, mse: 0.41570342
395
+ [2026-01-08 19:01:45 root] (train_utils.py 185): INFO layer 19 lwc lac iter 9, lr 0.00125375 time 3.872640s, mse: 0.41345572
396
+ [2026-01-08 19:01:49 root] (train_utils.py 185): INFO layer 19 lwc lac iter 10, lr 0.00083135 time 3.872133s, mse: 0.41054672
397
+ [2026-01-08 19:01:52 root] (train_utils.py 185): INFO layer 19 lwc lac iter 11, lr 0.00048198 time 3.871707s, mse: 0.40846488
398
+ [2026-01-08 19:01:56 root] (train_utils.py 185): INFO layer 19 lwc lac iter 12, lr 0.00022092 time 3.876148s, mse: 0.40727249
399
+ [2026-01-08 19:02:00 root] (train_utils.py 185): INFO layer 19 lwc lac iter 13, lr 0.00005958 time 3.881454s, mse: 0.40628025
400
+ [2026-01-08 19:02:04 root] (train_utils.py 185): INFO layer 19 lwc lac iter 14, lr 0.00000500 time 3.875313s, mse: 0.40573606
401
+ [2026-01-08 19:02:05 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
402
+ [2026-01-08 19:02:06 root] (train_utils.py 108): INFO ========= Layer 20 =========
403
+ [2026-01-08 19:02:15 root] (train_utils.py 185): INFO layer 20 lwc lac iter 0, lr 0.00494542 time 5.941118s, mse: 0.88836050
404
+ [2026-01-08 19:02:19 root] (train_utils.py 185): INFO layer 20 lwc lac iter 1, lr 0.00478408 time 3.928517s, mse: 0.59483135
405
+ [2026-01-08 19:02:23 root] (train_utils.py 185): INFO layer 20 lwc lac iter 2, lr 0.00452302 time 3.871794s, mse: 0.48579982
406
+ [2026-01-08 19:02:27 root] (train_utils.py 185): INFO layer 20 lwc lac iter 3, lr 0.00417365 time 3.873976s, mse: 0.46583182
407
+ [2026-01-08 19:02:31 root] (train_utils.py 185): INFO layer 20 lwc lac iter 4, lr 0.00375125 time 3.873488s, mse: 0.46044937
408
+ [2026-01-08 19:02:34 root] (train_utils.py 185): INFO layer 20 lwc lac iter 5, lr 0.00327427 time 3.871722s, mse: 0.45749170
409
+ [2026-01-08 19:02:38 root] (train_utils.py 185): INFO layer 20 lwc lac iter 6, lr 0.00276356 time 3.879350s, mse: 0.45316568
410
+ [2026-01-08 19:02:42 root] (train_utils.py 185): INFO layer 20 lwc lac iter 7, lr 0.00224144 time 3.870425s, mse: 0.45053339
411
+ [2026-01-08 19:02:46 root] (train_utils.py 185): INFO layer 20 lwc lac iter 8, lr 0.00173073 time 3.879096s, mse: 0.44832462
412
+ [2026-01-08 19:02:50 root] (train_utils.py 185): INFO layer 20 lwc lac iter 9, lr 0.00125375 time 3.866863s, mse: 0.44616416
413
+ [2026-01-08 19:02:54 root] (train_utils.py 185): INFO layer 20 lwc lac iter 10, lr 0.00083135 time 3.871922s, mse: 0.44334349
414
+ [2026-01-08 19:02:58 root] (train_utils.py 185): INFO layer 20 lwc lac iter 11, lr 0.00048198 time 3.874037s, mse: 0.44204527
415
+ [2026-01-08 19:03:02 root] (train_utils.py 185): INFO layer 20 lwc lac iter 12, lr 0.00022092 time 3.872909s, mse: 0.43987796
416
+ [2026-01-08 19:03:05 root] (train_utils.py 185): INFO layer 20 lwc lac iter 13, lr 0.00005958 time 3.871884s, mse: 0.43863490
417
+ [2026-01-08 19:03:09 root] (train_utils.py 185): INFO layer 20 lwc lac iter 14, lr 0.00000500 time 3.880204s, mse: 0.43791217
418
+ [2026-01-08 19:03:10 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
419
+ [2026-01-08 19:03:11 root] (train_utils.py 108): INFO ========= Layer 21 =========
420
+ [2026-01-08 19:03:20 root] (train_utils.py 185): INFO layer 21 lwc lac iter 0, lr 0.00494542 time 5.436487s, mse: 1.18043423
421
+ [2026-01-08 19:03:24 root] (train_utils.py 185): INFO layer 21 lwc lac iter 1, lr 0.00478408 time 3.929286s, mse: 0.77954561
422
+ [2026-01-08 19:03:27 root] (train_utils.py 185): INFO layer 21 lwc lac iter 2, lr 0.00452302 time 3.871185s, mse: 0.64111829
423
+ [2026-01-08 19:03:31 root] (train_utils.py 185): INFO layer 21 lwc lac iter 3, lr 0.00417365 time 3.875395s, mse: 0.61397409
424
+ [2026-01-08 19:03:35 root] (train_utils.py 185): INFO layer 21 lwc lac iter 4, lr 0.00375125 time 3.866674s, mse: 0.60631013
425
+ [2026-01-08 19:03:39 root] (train_utils.py 185): INFO layer 21 lwc lac iter 5, lr 0.00327427 time 3.872060s, mse: 0.60047567
426
+ [2026-01-08 19:03:43 root] (train_utils.py 185): INFO layer 21 lwc lac iter 6, lr 0.00276356 time 3.872387s, mse: 0.59512597
427
+ [2026-01-08 19:03:47 root] (train_utils.py 185): INFO layer 21 lwc lac iter 7, lr 0.00224144 time 3.870912s, mse: 0.59215677
428
+ [2026-01-08 19:03:51 root] (train_utils.py 185): INFO layer 21 lwc lac iter 8, lr 0.00173073 time 3.874108s, mse: 0.58796024
429
+ [2026-01-08 19:03:55 root] (train_utils.py 185): INFO layer 21 lwc lac iter 9, lr 0.00125375 time 3.873752s, mse: 0.58513182
430
+ [2026-01-08 19:03:58 root] (train_utils.py 185): INFO layer 21 lwc lac iter 10, lr 0.00083135 time 3.870490s, mse: 0.58225924
431
+ [2026-01-08 19:04:02 root] (train_utils.py 185): INFO layer 21 lwc lac iter 11, lr 0.00048198 time 3.874893s, mse: 0.57988369
432
+ [2026-01-08 19:04:06 root] (train_utils.py 185): INFO layer 21 lwc lac iter 12, lr 0.00022092 time 3.876416s, mse: 0.57718277
433
+ [2026-01-08 19:04:10 root] (train_utils.py 185): INFO layer 21 lwc lac iter 13, lr 0.00005958 time 3.874704s, mse: 0.57546204
434
+ [2026-01-08 19:04:14 root] (train_utils.py 185): INFO layer 21 lwc lac iter 14, lr 0.00000500 time 3.996245s, mse: 0.57469940
435
+ [2026-01-08 19:04:14 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
436
+ [2026-01-08 19:04:16 root] (train_utils.py 108): INFO ========= Layer 22 =========
437
+ [2026-01-08 19:04:23 root] (train_utils.py 185): INFO layer 22 lwc lac iter 0, lr 0.00494542 time 5.016886s, mse: 1.88664389
438
+ [2026-01-08 19:04:27 root] (train_utils.py 185): INFO layer 22 lwc lac iter 1, lr 0.00478408 time 3.875092s, mse: 1.18959606
439
+ [2026-01-08 19:04:31 root] (train_utils.py 185): INFO layer 22 lwc lac iter 2, lr 0.00452302 time 3.873603s, mse: 0.95907360
440
+ [2026-01-08 19:04:35 root] (train_utils.py 185): INFO layer 22 lwc lac iter 3, lr 0.00417365 time 3.870588s, mse: 0.91428280
441
+ [2026-01-08 19:04:39 root] (train_utils.py 185): INFO layer 22 lwc lac iter 4, lr 0.00375125 time 3.879905s, mse: 0.90376323
442
+ [2026-01-08 19:04:43 root] (train_utils.py 185): INFO layer 22 lwc lac iter 5, lr 0.00327427 time 3.871378s, mse: 0.89363086
443
+ [2026-01-08 19:04:47 root] (train_utils.py 185): INFO layer 22 lwc lac iter 6, lr 0.00276356 time 3.873839s, mse: 0.88751125
444
+ [2026-01-08 19:04:51 root] (train_utils.py 185): INFO layer 22 lwc lac iter 7, lr 0.00224144 time 3.877680s, mse: 0.87932986
445
+ [2026-01-08 19:04:54 root] (train_utils.py 185): INFO layer 22 lwc lac iter 8, lr 0.00173073 time 3.870394s, mse: 0.87506205
446
+ [2026-01-08 19:04:58 root] (train_utils.py 185): INFO layer 22 lwc lac iter 9, lr 0.00125375 time 3.868981s, mse: 0.86960399
447
+ [2026-01-08 19:05:02 root] (train_utils.py 185): INFO layer 22 lwc lac iter 10, lr 0.00083135 time 3.871379s, mse: 0.86433518
448
+ [2026-01-08 19:05:06 root] (train_utils.py 185): INFO layer 22 lwc lac iter 11, lr 0.00048198 time 3.873498s, mse: 0.85831034
449
+ [2026-01-08 19:05:10 root] (train_utils.py 185): INFO layer 22 lwc lac iter 12, lr 0.00022092 time 3.875648s, mse: 0.85434479
450
+ [2026-01-08 19:05:14 root] (train_utils.py 185): INFO layer 22 lwc lac iter 13, lr 0.00005958 time 3.873622s, mse: 0.85274106
451
+ [2026-01-08 19:05:18 root] (train_utils.py 185): INFO layer 22 lwc lac iter 14, lr 0.00000500 time 3.876666s, mse: 0.85105854
452
+ [2026-01-08 19:05:18 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
453
+ [2026-01-08 19:05:19 root] (train_utils.py 108): INFO ========= Layer 23 =========
454
+ [2026-01-08 19:05:27 root] (train_utils.py 185): INFO layer 23 lwc lac iter 0, lr 0.00494542 time 5.240867s, mse: 2.56160784
455
+ [2026-01-08 19:05:31 root] (train_utils.py 185): INFO layer 23 lwc lac iter 1, lr 0.00478408 time 3.870888s, mse: 1.69400561
456
+ [2026-01-08 19:05:34 root] (train_utils.py 185): INFO layer 23 lwc lac iter 2, lr 0.00452302 time 3.871142s, mse: 1.40092814
457
+ [2026-01-08 19:05:38 root] (train_utils.py 185): INFO layer 23 lwc lac iter 3, lr 0.00417365 time 3.871503s, mse: 1.33960748
458
+ [2026-01-08 19:05:42 root] (train_utils.py 185): INFO layer 23 lwc lac iter 4, lr 0.00375125 time 3.876095s, mse: 1.31923652
459
+ [2026-01-08 19:05:46 root] (train_utils.py 185): INFO layer 23 lwc lac iter 5, lr 0.00327427 time 3.883029s, mse: 1.30260742
460
+ [2026-01-08 19:05:50 root] (train_utils.py 185): INFO layer 23 lwc lac iter 6, lr 0.00276356 time 3.871961s, mse: 1.29341400
461
+ [2026-01-08 19:05:54 root] (train_utils.py 185): INFO layer 23 lwc lac iter 7, lr 0.00224144 time 3.867985s, mse: 1.28473794
462
+ [2026-01-08 19:05:58 root] (train_utils.py 185): INFO layer 23 lwc lac iter 8, lr 0.00173073 time 3.873707s, mse: 1.27725101
463
+ [2026-01-08 19:06:02 root] (train_utils.py 185): INFO layer 23 lwc lac iter 9, lr 0.00125375 time 3.868931s, mse: 1.27071691
464
+ [2026-01-08 19:06:05 root] (train_utils.py 185): INFO layer 23 lwc lac iter 10, lr 0.00083135 time 3.874512s, mse: 1.26552820
465
+ [2026-01-08 19:06:09 root] (train_utils.py 185): INFO layer 23 lwc lac iter 11, lr 0.00048198 time 3.869352s, mse: 1.26018000
466
+ [2026-01-08 19:06:13 root] (train_utils.py 185): INFO layer 23 lwc lac iter 12, lr 0.00022092 time 3.874362s, mse: 1.25696874
467
+ [2026-01-08 19:06:17 root] (train_utils.py 185): INFO layer 23 lwc lac iter 13, lr 0.00005958 time 3.875837s, mse: 1.25348544
468
+ [2026-01-08 19:06:21 root] (train_utils.py 185): INFO layer 23 lwc lac iter 14, lr 0.00000500 time 3.874819s, mse: 1.25113153
469
+ [2026-01-08 19:06:21 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
470
+ [2026-01-08 19:06:22 root] (train_utils.py 108): INFO ========= Layer 24 =========
471
+ [2026-01-08 19:06:30 root] (train_utils.py 185): INFO layer 24 lwc lac iter 0, lr 0.00494542 time 5.198261s, mse: 3.33080626
472
+ [2026-01-08 19:06:34 root] (train_utils.py 185): INFO layer 24 lwc lac iter 1, lr 0.00478408 time 3.873992s, mse: 2.21739531
473
+ [2026-01-08 19:06:38 root] (train_utils.py 185): INFO layer 24 lwc lac iter 2, lr 0.00452302 time 3.884522s, mse: 1.83558488
474
+ [2026-01-08 19:06:42 root] (train_utils.py 185): INFO layer 24 lwc lac iter 3, lr 0.00417365 time 3.872766s, mse: 1.75192118
475
+ [2026-01-08 19:06:45 root] (train_utils.py 185): INFO layer 24 lwc lac iter 4, lr 0.00375125 time 3.874427s, mse: 1.73021388
476
+ [2026-01-08 19:06:49 root] (train_utils.py 185): INFO layer 24 lwc lac iter 5, lr 0.00327427 time 3.869450s, mse: 1.70965135
477
+ [2026-01-08 19:06:53 root] (train_utils.py 185): INFO layer 24 lwc lac iter 6, lr 0.00276356 time 3.871820s, mse: 1.69753647
478
+ [2026-01-08 19:06:57 root] (train_utils.py 185): INFO layer 24 lwc lac iter 7, lr 0.00224144 time 3.876565s, mse: 1.68364048
479
+ [2026-01-08 19:07:01 root] (train_utils.py 185): INFO layer 24 lwc lac iter 8, lr 0.00173073 time 3.880944s, mse: 1.67123342
480
+ [2026-01-08 19:07:05 root] (train_utils.py 185): INFO layer 24 lwc lac iter 9, lr 0.00125375 time 3.879658s, mse: 1.66224420
481
+ [2026-01-08 19:07:09 root] (train_utils.py 185): INFO layer 24 lwc lac iter 10, lr 0.00083135 time 3.870554s, mse: 1.65476453
482
+ [2026-01-08 19:07:13 root] (train_utils.py 185): INFO layer 24 lwc lac iter 11, lr 0.00048198 time 3.873933s, mse: 1.64498436
483
+ [2026-01-08 19:07:16 root] (train_utils.py 185): INFO layer 24 lwc lac iter 12, lr 0.00022092 time 3.881927s, mse: 1.63647079
484
+ [2026-01-08 19:07:20 root] (train_utils.py 185): INFO layer 24 lwc lac iter 13, lr 0.00005958 time 3.875716s, mse: 1.63291585
485
+ [2026-01-08 19:07:25 root] (train_utils.py 185): INFO layer 24 lwc lac iter 14, lr 0.00000500 time 4.456192s, mse: 1.63007939
486
+ [2026-01-08 19:07:25 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
487
+ [2026-01-08 19:07:27 root] (train_utils.py 108): INFO ========= Layer 25 =========
488
+ [2026-01-08 19:07:35 root] (train_utils.py 185): INFO layer 25 lwc lac iter 0, lr 0.00494542 time 5.174078s, mse: 3.67945337
489
+ [2026-01-08 19:07:38 root] (train_utils.py 185): INFO layer 25 lwc lac iter 1, lr 0.00478408 time 3.928041s, mse: 2.39840055
490
+ [2026-01-08 19:07:42 root] (train_utils.py 185): INFO layer 25 lwc lac iter 2, lr 0.00452302 time 3.927209s, mse: 2.00158238
491
+ [2026-01-08 19:07:46 root] (train_utils.py 185): INFO layer 25 lwc lac iter 3, lr 0.00417365 time 3.886188s, mse: 1.92655563
492
+ [2026-01-08 19:07:50 root] (train_utils.py 185): INFO layer 25 lwc lac iter 4, lr 0.00375125 time 3.873621s, mse: 1.90741169
493
+ [2026-01-08 19:07:54 root] (train_utils.py 185): INFO layer 25 lwc lac iter 5, lr 0.00327427 time 3.874353s, mse: 1.89064825
494
+ [2026-01-08 19:07:58 root] (train_utils.py 185): INFO layer 25 lwc lac iter 6, lr 0.00276356 time 3.867891s, mse: 1.88254857
495
+ [2026-01-08 19:08:02 root] (train_utils.py 185): INFO layer 25 lwc lac iter 7, lr 0.00224144 time 3.872960s, mse: 1.87189174
496
+ [2026-01-08 19:08:06 root] (train_utils.py 185): INFO layer 25 lwc lac iter 8, lr 0.00173073 time 3.890402s, mse: 1.86226833
497
+ [2026-01-08 19:08:10 root] (train_utils.py 185): INFO layer 25 lwc lac iter 9, lr 0.00125375 time 3.876318s, mse: 1.85414529
498
+ [2026-01-08 19:08:13 root] (train_utils.py 185): INFO layer 25 lwc lac iter 10, lr 0.00083135 time 3.869507s, mse: 1.84632003
499
+ [2026-01-08 19:08:17 root] (train_utils.py 185): INFO layer 25 lwc lac iter 11, lr 0.00048198 time 3.872166s, mse: 1.83962476
500
+ [2026-01-08 19:08:21 root] (train_utils.py 185): INFO layer 25 lwc lac iter 12, lr 0.00022092 time 3.871724s, mse: 1.83272731
501
+ [2026-01-08 19:08:25 root] (train_utils.py 185): INFO layer 25 lwc lac iter 13, lr 0.00005958 time 3.872951s, mse: 1.83188641
502
+ [2026-01-08 19:08:29 root] (train_utils.py 185): INFO layer 25 lwc lac iter 14, lr 0.00000500 time 3.873922s, mse: 1.82856822
503
+ [2026-01-08 19:08:29 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
504
+ [2026-01-08 19:08:30 root] (train_utils.py 108): INFO ========= Layer 26 =========
505
+ [2026-01-08 19:08:38 root] (train_utils.py 185): INFO layer 26 lwc lac iter 0, lr 0.00494542 time 5.560466s, mse: 4.35819054
506
+ [2026-01-08 19:08:42 root] (train_utils.py 185): INFO layer 26 lwc lac iter 1, lr 0.00478408 time 3.936876s, mse: 2.94494462
507
+ [2026-01-08 19:08:46 root] (train_utils.py 185): INFO layer 26 lwc lac iter 2, lr 0.00452302 time 3.871977s, mse: 2.46222878
508
+ [2026-01-08 19:08:50 root] (train_utils.py 185): INFO layer 26 lwc lac iter 3, lr 0.00417365 time 3.870587s, mse: 2.36697221
509
+ [2026-01-08 19:08:54 root] (train_utils.py 185): INFO layer 26 lwc lac iter 4, lr 0.00375125 time 3.872370s, mse: 2.34871936
510
+ [2026-01-08 19:08:58 root] (train_utils.py 185): INFO layer 26 lwc lac iter 5, lr 0.00327427 time 3.873634s, mse: 2.33013940
511
+ [2026-01-08 19:09:01 root] (train_utils.py 185): INFO layer 26 lwc lac iter 6, lr 0.00276356 time 3.881063s, mse: 2.31725478
512
+ [2026-01-08 19:09:05 root] (train_utils.py 185): INFO layer 26 lwc lac iter 7, lr 0.00224144 time 3.873708s, mse: 2.30295658
513
+ [2026-01-08 19:09:09 root] (train_utils.py 185): INFO layer 26 lwc lac iter 8, lr 0.00173073 time 3.880649s, mse: 2.29171467
514
+ [2026-01-08 19:09:13 root] (train_utils.py 185): INFO layer 26 lwc lac iter 9, lr 0.00125375 time 3.878683s, mse: 2.28112888
515
+ [2026-01-08 19:09:17 root] (train_utils.py 185): INFO layer 26 lwc lac iter 10, lr 0.00083135 time 3.870284s, mse: 2.27260423
516
+ [2026-01-08 19:09:21 root] (train_utils.py 185): INFO layer 26 lwc lac iter 11, lr 0.00048198 time 3.876089s, mse: 2.26187754
517
+ [2026-01-08 19:09:25 root] (train_utils.py 185): INFO layer 26 lwc lac iter 12, lr 0.00022092 time 3.881677s, mse: 2.25517917
518
+ [2026-01-08 19:09:29 root] (train_utils.py 185): INFO layer 26 lwc lac iter 13, lr 0.00005958 time 3.872640s, mse: 2.24800634
519
+ [2026-01-08 19:09:32 root] (train_utils.py 185): INFO layer 26 lwc lac iter 14, lr 0.00000500 time 3.872203s, mse: 2.24403787
520
+ [2026-01-08 19:09:33 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
521
+ [2026-01-08 19:09:34 root] (train_utils.py 108): INFO ========= Layer 27 =========
522
+ [2026-01-08 19:09:42 root] (train_utils.py 185): INFO layer 27 lwc lac iter 0, lr 0.00494542 time 5.206716s, mse: 5.94560862
523
+ [2026-01-08 19:09:45 root] (train_utils.py 185): INFO layer 27 lwc lac iter 1, lr 0.00478408 time 3.875546s, mse: 3.95834851
524
+ [2026-01-08 19:09:49 root] (train_utils.py 185): INFO layer 27 lwc lac iter 2, lr 0.00452302 time 3.878089s, mse: 3.32281756
525
+ [2026-01-08 19:09:53 root] (train_utils.py 185): INFO layer 27 lwc lac iter 3, lr 0.00417365 time 3.869850s, mse: 3.18086267
526
+ [2026-01-08 19:09:57 root] (train_utils.py 185): INFO layer 27 lwc lac iter 4, lr 0.00375125 time 3.880871s, mse: 3.14467168
527
+ [2026-01-08 19:10:01 root] (train_utils.py 185): INFO layer 27 lwc lac iter 5, lr 0.00327427 time 3.874153s, mse: 3.12000346
528
+ [2026-01-08 19:10:05 root] (train_utils.py 185): INFO layer 27 lwc lac iter 6, lr 0.00276356 time 3.872086s, mse: 3.09776139
529
+ [2026-01-08 19:10:09 root] (train_utils.py 185): INFO layer 27 lwc lac iter 7, lr 0.00224144 time 3.870448s, mse: 3.07834363
530
+ [2026-01-08 19:10:12 root] (train_utils.py 185): INFO layer 27 lwc lac iter 8, lr 0.00173073 time 3.872187s, mse: 3.06277657
531
+ [2026-01-08 19:10:16 root] (train_utils.py 185): INFO layer 27 lwc lac iter 9, lr 0.00125375 time 3.868123s, mse: 3.04591680
532
+ [2026-01-08 19:10:20 root] (train_utils.py 185): INFO layer 27 lwc lac iter 10, lr 0.00083135 time 3.875916s, mse: 3.03134632
533
+ [2026-01-08 19:10:24 root] (train_utils.py 185): INFO layer 27 lwc lac iter 11, lr 0.00048198 time 3.873957s, mse: 3.01916480
534
+ [2026-01-08 19:10:28 root] (train_utils.py 185): INFO layer 27 lwc lac iter 12, lr 0.00022092 time 3.869634s, mse: 3.00719571
535
+ [2026-01-08 19:10:32 root] (train_utils.py 185): INFO layer 27 lwc lac iter 13, lr 0.00005958 time 3.868931s, mse: 2.99984956
536
+ [2026-01-08 19:10:36 root] (train_utils.py 185): INFO layer 27 lwc lac iter 14, lr 0.00000500 time 3.875341s, mse: 2.99120903
537
+ [2026-01-08 19:10:36 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
538
+ [2026-01-08 19:10:37 root] (train_utils.py 108): INFO ========= Layer 28 =========
539
+ [2026-01-08 19:10:45 root] (train_utils.py 185): INFO layer 28 lwc lac iter 0, lr 0.00494542 time 5.016971s, mse: 8.40579605
540
+ [2026-01-08 19:10:48 root] (train_utils.py 185): INFO layer 28 lwc lac iter 1, lr 0.00478408 time 3.877848s, mse: 5.55529737
541
+ [2026-01-08 19:10:52 root] (train_utils.py 185): INFO layer 28 lwc lac iter 2, lr 0.00452302 time 3.868832s, mse: 4.64479589
542
+ [2026-01-08 19:10:56 root] (train_utils.py 185): INFO layer 28 lwc lac iter 3, lr 0.00417365 time 3.866925s, mse: 4.46341419
543
+ [2026-01-08 19:11:00 root] (train_utils.py 185): INFO layer 28 lwc lac iter 4, lr 0.00375125 time 3.882068s, mse: 4.40386772
544
+ [2026-01-08 19:11:04 root] (train_utils.py 185): INFO layer 28 lwc lac iter 5, lr 0.00327427 time 3.872863s, mse: 4.37245226
545
+ [2026-01-08 19:11:08 root] (train_utils.py 185): INFO layer 28 lwc lac iter 6, lr 0.00276356 time 3.871567s, mse: 4.34240580
546
+ [2026-01-08 19:11:12 root] (train_utils.py 185): INFO layer 28 lwc lac iter 7, lr 0.00224144 time 3.868959s, mse: 4.31763363
547
+ [2026-01-08 19:11:15 root] (train_utils.py 185): INFO layer 28 lwc lac iter 8, lr 0.00173073 time 3.876480s, mse: 4.29854107
548
+ [2026-01-08 19:11:19 root] (train_utils.py 185): INFO layer 28 lwc lac iter 9, lr 0.00125375 time 3.870713s, mse: 4.28071547
549
+ [2026-01-08 19:11:23 root] (train_utils.py 185): INFO layer 28 lwc lac iter 10, lr 0.00083135 time 3.866973s, mse: 4.26679897
550
+ [2026-01-08 19:11:27 root] (train_utils.py 185): INFO layer 28 lwc lac iter 11, lr 0.00048198 time 3.869495s, mse: 4.24268007
551
+ [2026-01-08 19:11:31 root] (train_utils.py 185): INFO layer 28 lwc lac iter 12, lr 0.00022092 time 3.870375s, mse: 4.22641373
552
+ [2026-01-08 19:11:35 root] (train_utils.py 185): INFO layer 28 lwc lac iter 13, lr 0.00005958 time 3.871195s, mse: 4.22128248
553
+ [2026-01-08 19:11:39 root] (train_utils.py 185): INFO layer 28 lwc lac iter 14, lr 0.00000500 time 3.868412s, mse: 4.21494389
554
+ [2026-01-08 19:11:39 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
555
+ [2026-01-08 19:11:39 root] (train_utils.py 108): INFO ========= Layer 29 =========
556
+ [2026-01-08 19:11:47 root] (train_utils.py 185): INFO layer 29 lwc lac iter 0, lr 0.00494542 time 5.232268s, mse: 10.38746834
557
+ [2026-01-08 19:11:51 root] (train_utils.py 185): INFO layer 29 lwc lac iter 1, lr 0.00478408 time 4.035455s, mse: 7.14648628
558
+ [2026-01-08 19:11:55 root] (train_utils.py 185): INFO layer 29 lwc lac iter 2, lr 0.00452302 time 3.977416s, mse: 6.03318691
559
+ [2026-01-08 19:11:59 root] (train_utils.py 185): INFO layer 29 lwc lac iter 3, lr 0.00417365 time 4.021685s, mse: 5.78764057
560
+ [2026-01-08 19:12:03 root] (train_utils.py 185): INFO layer 29 lwc lac iter 4, lr 0.00375125 time 3.906455s, mse: 5.71550655
561
+ [2026-01-08 19:12:07 root] (train_utils.py 185): INFO layer 29 lwc lac iter 5, lr 0.00327427 time 3.877396s, mse: 5.66473246
562
+ [2026-01-08 19:12:11 root] (train_utils.py 185): INFO layer 29 lwc lac iter 6, lr 0.00276356 time 3.871670s, mse: 5.61916113
563
+ [2026-01-08 19:12:14 root] (train_utils.py 185): INFO layer 29 lwc lac iter 7, lr 0.00224144 time 3.873803s, mse: 5.58458805
564
+ [2026-01-08 19:12:18 root] (train_utils.py 185): INFO layer 29 lwc lac iter 8, lr 0.00173073 time 3.873369s, mse: 5.54784393
565
+ [2026-01-08 19:12:22 root] (train_utils.py 185): INFO layer 29 lwc lac iter 9, lr 0.00125375 time 3.873344s, mse: 5.52231646
566
+ [2026-01-08 19:12:26 root] (train_utils.py 185): INFO layer 29 lwc lac iter 10, lr 0.00083135 time 3.877249s, mse: 5.48976994
567
+ [2026-01-08 19:12:30 root] (train_utils.py 185): INFO layer 29 lwc lac iter 11, lr 0.00048198 time 3.912101s, mse: 5.46507311
568
+ [2026-01-08 19:12:34 root] (train_utils.py 185): INFO layer 29 lwc lac iter 12, lr 0.00022092 time 3.878016s, mse: 5.44575977
569
+ [2026-01-08 19:12:38 root] (train_utils.py 185): INFO layer 29 lwc lac iter 13, lr 0.00005958 time 3.873230s, mse: 5.43577242
570
+ [2026-01-08 19:12:42 root] (train_utils.py 185): INFO layer 29 lwc lac iter 14, lr 0.00000500 time 3.871523s, mse: 5.42604542
571
+ [2026-01-08 19:12:42 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
572
+ [2026-01-08 19:12:43 root] (train_utils.py 108): INFO ========= Layer 30 =========
573
+ [2026-01-08 19:12:50 root] (train_utils.py 185): INFO layer 30 lwc lac iter 0, lr 0.00494542 time 4.795969s, mse: 16.29405975
574
+ [2026-01-08 19:12:54 root] (train_utils.py 185): INFO layer 30 lwc lac iter 1, lr 0.00478408 time 3.870167s, mse: 11.01632500
575
+ [2026-01-08 19:12:58 root] (train_utils.py 185): INFO layer 30 lwc lac iter 2, lr 0.00452302 time 3.869613s, mse: 9.27882481
576
+ [2026-01-08 19:13:02 root] (train_utils.py 185): INFO layer 30 lwc lac iter 3, lr 0.00417365 time 3.875138s, mse: 8.87542439
577
+ [2026-01-08 19:13:06 root] (train_utils.py 185): INFO layer 30 lwc lac iter 4, lr 0.00375125 time 3.935740s, mse: 8.75351048
578
+ [2026-01-08 19:13:09 root] (train_utils.py 185): INFO layer 30 lwc lac iter 5, lr 0.00327427 time 3.915037s, mse: 8.65880680
579
+ [2026-01-08 19:13:13 root] (train_utils.py 185): INFO layer 30 lwc lac iter 6, lr 0.00276356 time 3.883668s, mse: 8.60634327
580
+ [2026-01-08 19:13:17 root] (train_utils.py 185): INFO layer 30 lwc lac iter 7, lr 0.00224144 time 3.902720s, mse: 8.53597736
581
+ [2026-01-08 19:13:21 root] (train_utils.py 185): INFO layer 30 lwc lac iter 8, lr 0.00173073 time 3.875391s, mse: 8.50352001
582
+ [2026-01-08 19:13:25 root] (train_utils.py 185): INFO layer 30 lwc lac iter 9, lr 0.00125375 time 3.871288s, mse: 8.44190311
583
+ [2026-01-08 19:13:29 root] (train_utils.py 185): INFO layer 30 lwc lac iter 10, lr 0.00083135 time 3.872681s, mse: 8.40491486
584
+ [2026-01-08 19:13:33 root] (train_utils.py 185): INFO layer 30 lwc lac iter 11, lr 0.00048198 time 3.878355s, mse: 8.38511753
585
+ [2026-01-08 19:13:37 root] (train_utils.py 185): INFO layer 30 lwc lac iter 12, lr 0.00022092 time 3.878202s, mse: 8.35692787
586
+ [2026-01-08 19:13:40 root] (train_utils.py 185): INFO layer 30 lwc lac iter 13, lr 0.00005958 time 3.878534s, mse: 8.35674667
587
+ [2026-01-08 19:13:44 root] (train_utils.py 185): INFO layer 30 lwc lac iter 14, lr 0.00000500 time 3.874162s, mse: 8.34408569
588
+ [2026-01-08 19:13:45 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
589
+ [2026-01-08 19:13:45 root] (train_utils.py 108): INFO ========= Layer 31 =========
590
+ [2026-01-08 19:13:53 root] (train_utils.py 185): INFO layer 31 lwc lac iter 0, lr 0.00494542 time 5.028024s, mse: 20.78250885
591
+ [2026-01-08 19:13:57 root] (train_utils.py 185): INFO layer 31 lwc lac iter 1, lr 0.00478408 time 3.877883s, mse: 14.37235165
592
+ [2026-01-08 19:14:01 root] (train_utils.py 185): INFO layer 31 lwc lac iter 2, lr 0.00452302 time 3.880755s, mse: 12.13233566
593
+ [2026-01-08 19:14:04 root] (train_utils.py 185): INFO layer 31 lwc lac iter 3, lr 0.00417365 time 3.879653s, mse: 11.62570667
594
+ [2026-01-08 19:14:08 root] (train_utils.py 185): INFO layer 31 lwc lac iter 4, lr 0.00375125 time 3.874181s, mse: 11.51362991
595
+ [2026-01-08 19:14:12 root] (train_utils.py 185): INFO layer 31 lwc lac iter 5, lr 0.00327427 time 3.873015s, mse: 11.42485142
596
+ [2026-01-08 19:14:16 root] (train_utils.py 185): INFO layer 31 lwc lac iter 6, lr 0.00276356 time 3.880170s, mse: 11.33607769
597
+ [2026-01-08 19:14:20 root] (train_utils.py 185): INFO layer 31 lwc lac iter 7, lr 0.00224144 time 3.871797s, mse: 11.27843571
598
+ [2026-01-08 19:14:24 root] (train_utils.py 185): INFO layer 31 lwc lac iter 8, lr 0.00173073 time 3.874252s, mse: 11.22037888
599
+ [2026-01-08 19:14:28 root] (train_utils.py 185): INFO layer 31 lwc lac iter 9, lr 0.00125375 time 3.875672s, mse: 11.15839195
600
+ [2026-01-08 19:14:32 root] (train_utils.py 185): INFO layer 31 lwc lac iter 10, lr 0.00083135 time 3.878424s, mse: 11.12734127
601
+ [2026-01-08 19:14:35 root] (train_utils.py 185): INFO layer 31 lwc lac iter 11, lr 0.00048198 time 3.873094s, mse: 11.08810806
602
+ [2026-01-08 19:14:39 root] (train_utils.py 185): INFO layer 31 lwc lac iter 12, lr 0.00022092 time 3.871928s, mse: 11.05513668
603
+ [2026-01-08 19:14:43 root] (train_utils.py 185): INFO layer 31 lwc lac iter 13, lr 0.00005958 time 3.875564s, mse: 11.03436947
604
+ [2026-01-08 19:14:47 root] (train_utils.py 185): INFO layer 31 lwc lac iter 14, lr 0.00000500 time 3.871252s, mse: 11.01393795
605
+ [2026-01-08 19:14:48 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
606
+ [2026-01-08 19:14:48 root] (train_utils.py 108): INFO ========= Layer 32 =========
607
+ [2026-01-08 19:14:56 root] (train_utils.py 185): INFO layer 32 lwc lac iter 0, lr 0.00494542 time 5.154165s, mse: 28.37956429
608
+ [2026-01-08 19:15:00 root] (train_utils.py 185): INFO layer 32 lwc lac iter 1, lr 0.00478408 time 3.881288s, mse: 19.76789856
609
+ [2026-01-08 19:15:04 root] (train_utils.py 185): INFO layer 32 lwc lac iter 2, lr 0.00452302 time 3.875616s, mse: 16.61169624
610
+ [2026-01-08 19:15:08 root] (train_utils.py 185): INFO layer 32 lwc lac iter 3, lr 0.00417365 time 3.874239s, mse: 15.88970184
611
+ [2026-01-08 19:15:12 root] (train_utils.py 185): INFO layer 32 lwc lac iter 4, lr 0.00375125 time 3.879297s, mse: 15.74769402
612
+ [2026-01-08 19:15:16 root] (train_utils.py 185): INFO layer 32 lwc lac iter 5, lr 0.00327427 time 3.874795s, mse: 15.61922455
613
+ [2026-01-08 19:15:19 root] (train_utils.py 185): INFO layer 32 lwc lac iter 6, lr 0.00276356 time 3.876008s, mse: 15.51004982
614
+ [2026-01-08 19:15:23 root] (train_utils.py 185): INFO layer 32 lwc lac iter 7, lr 0.00224144 time 3.878464s, mse: 15.42904854
615
+ [2026-01-08 19:15:27 root] (train_utils.py 185): INFO layer 32 lwc lac iter 8, lr 0.00173073 time 3.874979s, mse: 15.34880447
616
+ [2026-01-08 19:15:31 root] (train_utils.py 185): INFO layer 32 lwc lac iter 9, lr 0.00125375 time 3.878422s, mse: 15.27359772
617
+ [2026-01-08 19:15:35 root] (train_utils.py 185): INFO layer 32 lwc lac iter 10, lr 0.00083135 time 3.886444s, mse: 15.21441174
618
+ [2026-01-08 19:15:39 root] (train_utils.py 185): INFO layer 32 lwc lac iter 11, lr 0.00048198 time 3.877920s, mse: 15.16252708
619
+ [2026-01-08 19:15:43 root] (train_utils.py 185): INFO layer 32 lwc lac iter 12, lr 0.00022092 time 3.875979s, mse: 15.10843849
620
+ [2026-01-08 19:15:47 root] (train_utils.py 185): INFO layer 32 lwc lac iter 13, lr 0.00005958 time 3.876947s, mse: 15.08382893
621
+ [2026-01-08 19:15:50 root] (train_utils.py 185): INFO layer 32 lwc lac iter 14, lr 0.00000500 time 3.878299s, mse: 15.06546974
622
+ [2026-01-08 19:15:51 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
623
+ [2026-01-08 19:15:52 root] (train_utils.py 108): INFO ========= Layer 33 =========
624
+ [2026-01-08 19:15:59 root] (train_utils.py 185): INFO layer 33 lwc lac iter 0, lr 0.00494542 time 5.236542s, mse: 41.54327011
625
+ [2026-01-08 19:16:03 root] (train_utils.py 185): INFO layer 33 lwc lac iter 1, lr 0.00478408 time 3.938019s, mse: 27.93664551
626
+ [2026-01-08 19:16:07 root] (train_utils.py 185): INFO layer 33 lwc lac iter 2, lr 0.00452302 time 3.878417s, mse: 23.32941628
627
+ [2026-01-08 19:16:11 root] (train_utils.py 185): INFO layer 33 lwc lac iter 3, lr 0.00417365 time 3.872800s, mse: 22.34293175
628
+ [2026-01-08 19:16:15 root] (train_utils.py 185): INFO layer 33 lwc lac iter 4, lr 0.00375125 time 3.874553s, mse: 22.07669640
629
+ [2026-01-08 19:16:19 root] (train_utils.py 185): INFO layer 33 lwc lac iter 5, lr 0.00327427 time 3.874539s, mse: 21.87960243
630
+ [2026-01-08 19:16:23 root] (train_utils.py 185): INFO layer 33 lwc lac iter 6, lr 0.00276356 time 3.869958s, mse: 21.73635674
631
+ [2026-01-08 19:16:27 root] (train_utils.py 185): INFO layer 33 lwc lac iter 7, lr 0.00224144 time 3.875664s, mse: 21.58724403
632
+ [2026-01-08 19:16:30 root] (train_utils.py 185): INFO layer 33 lwc lac iter 8, lr 0.00173073 time 3.880439s, mse: 21.46766853
633
+ [2026-01-08 19:16:34 root] (train_utils.py 185): INFO layer 33 lwc lac iter 9, lr 0.00125375 time 3.872697s, mse: 21.36098099
634
+ [2026-01-08 19:16:38 root] (train_utils.py 185): INFO layer 33 lwc lac iter 10, lr 0.00083135 time 3.876025s, mse: 21.27636719
635
+ [2026-01-08 19:16:42 root] (train_utils.py 185): INFO layer 33 lwc lac iter 11, lr 0.00048198 time 3.871703s, mse: 21.16030693
636
+ [2026-01-08 19:16:46 root] (train_utils.py 185): INFO layer 33 lwc lac iter 12, lr 0.00022092 time 3.870745s, mse: 21.07536125
637
+ [2026-01-08 19:16:50 root] (train_utils.py 185): INFO layer 33 lwc lac iter 13, lr 0.00005958 time 3.876155s, mse: 20.99114990
638
+ [2026-01-08 19:16:54 root] (train_utils.py 185): INFO layer 33 lwc lac iter 14, lr 0.00000500 time 3.873363s, mse: 20.95961761
639
+ [2026-01-08 19:16:54 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
640
+ [2026-01-08 19:16:55 root] (train_utils.py 108): INFO ========= Layer 34 =========
641
+ [2026-01-08 19:17:03 root] (train_utils.py 185): INFO layer 34 lwc lac iter 0, lr 0.00494542 time 4.991384s, mse: 64.93594360
642
+ [2026-01-08 19:17:07 root] (train_utils.py 185): INFO layer 34 lwc lac iter 1, lr 0.00478408 time 3.943139s, mse: 40.86461258
643
+ [2026-01-08 19:17:10 root] (train_utils.py 185): INFO layer 34 lwc lac iter 2, lr 0.00452302 time 3.868030s, mse: 33.65349960
644
+ [2026-01-08 19:17:14 root] (train_utils.py 185): INFO layer 34 lwc lac iter 3, lr 0.00417365 time 3.868735s, mse: 31.96302605
645
+ [2026-01-08 19:17:18 root] (train_utils.py 185): INFO layer 34 lwc lac iter 4, lr 0.00375125 time 3.873104s, mse: 31.66926384
646
+ [2026-01-08 19:17:22 root] (train_utils.py 185): INFO layer 34 lwc lac iter 5, lr 0.00327427 time 3.873229s, mse: 31.07656479
647
+ [2026-01-08 19:17:26 root] (train_utils.py 185): INFO layer 34 lwc lac iter 6, lr 0.00276356 time 3.873526s, mse: 30.91048813
648
+ [2026-01-08 19:17:30 root] (train_utils.py 185): INFO layer 34 lwc lac iter 7, lr 0.00224144 time 3.875315s, mse: 30.05115700
649
+ [2026-01-08 19:17:34 root] (train_utils.py 185): INFO layer 34 lwc lac iter 8, lr 0.00173073 time 3.879331s, mse: 29.89023590
650
+ [2026-01-08 19:17:38 root] (train_utils.py 185): INFO layer 34 lwc lac iter 9, lr 0.00125375 time 3.873674s, mse: 30.35319901
651
+ [2026-01-08 19:17:41 root] (train_utils.py 185): INFO layer 34 lwc lac iter 10, lr 0.00083135 time 3.871441s, mse: 29.46559715
652
+ [2026-01-08 19:17:45 root] (train_utils.py 185): INFO layer 34 lwc lac iter 11, lr 0.00048198 time 3.869557s, mse: 29.05239487
653
+ [2026-01-08 19:17:49 root] (train_utils.py 185): INFO layer 34 lwc lac iter 12, lr 0.00022092 time 3.872165s, mse: 28.86521339
654
+ [2026-01-08 19:17:53 root] (train_utils.py 185): INFO layer 34 lwc lac iter 13, lr 0.00005958 time 3.871504s, mse: 28.74409676
655
+ [2026-01-08 19:17:57 root] (train_utils.py 185): INFO layer 34 lwc lac iter 14, lr 0.00000500 time 3.877051s, mse: 28.70412636
656
+ [2026-01-08 19:17:57 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
657
+ [2026-01-08 19:17:58 root] (train_utils.py 108): INFO ========= Layer 35 =========
658
+ [2026-01-08 19:18:06 root] (train_utils.py 185): INFO layer 35 lwc lac iter 0, lr 0.00494542 time 5.117816s, mse: 108.25781250
659
+ [2026-01-08 19:18:10 root] (train_utils.py 185): INFO layer 35 lwc lac iter 1, lr 0.00478408 time 3.871820s, mse: 38.04971313
660
+ [2026-01-08 19:18:13 root] (train_utils.py 185): INFO layer 35 lwc lac iter 2, lr 0.00452302 time 3.875288s, mse: 31.63025665
661
+ [2026-01-08 19:18:17 root] (train_utils.py 185): INFO layer 35 lwc lac iter 3, lr 0.00417365 time 3.870218s, mse: 29.21376991
662
+ [2026-01-08 19:18:21 root] (train_utils.py 185): INFO layer 35 lwc lac iter 4, lr 0.00375125 time 3.873308s, mse: 28.19089508
663
+ [2026-01-08 19:18:25 root] (train_utils.py 185): INFO layer 35 lwc lac iter 5, lr 0.00327427 time 3.872198s, mse: 28.40728760
664
+ [2026-01-08 19:18:29 root] (train_utils.py 185): INFO layer 35 lwc lac iter 6, lr 0.00276356 time 3.873228s, mse: 27.74842644
665
+ [2026-01-08 19:18:33 root] (train_utils.py 185): INFO layer 35 lwc lac iter 7, lr 0.00224144 time 3.872646s, mse: 27.13273811
666
+ [2026-01-08 19:18:37 root] (train_utils.py 185): INFO layer 35 lwc lac iter 8, lr 0.00173073 time 3.887236s, mse: 26.53238487
667
+ [2026-01-08 19:18:41 root] (train_utils.py 185): INFO layer 35 lwc lac iter 9, lr 0.00125375 time 3.929309s, mse: 26.14052200
668
+ [2026-01-08 19:18:44 root] (train_utils.py 185): INFO layer 35 lwc lac iter 10, lr 0.00083135 time 3.869573s, mse: 25.63203621
669
+ [2026-01-08 19:18:48 root] (train_utils.py 185): INFO layer 35 lwc lac iter 11, lr 0.00048198 time 3.877343s, mse: 25.35079384
670
+ [2026-01-08 19:18:52 root] (train_utils.py 185): INFO layer 35 lwc lac iter 12, lr 0.00022092 time 3.877298s, mse: 25.21109390
671
+ [2026-01-08 19:18:56 root] (train_utils.py 185): INFO layer 35 lwc lac iter 13, lr 0.00005958 time 3.884227s, mse: 24.95710945
672
+ [2026-01-08 19:19:00 root] (train_utils.py 185): INFO layer 35 lwc lac iter 14, lr 0.00000500 time 3.953963s, mse: 24.85692596
673
+ [2026-01-08 19:19:01 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
674
+ [2026-01-08 19:19:38 root] (main.py 39): INFO Finished reparameterize model.
675
+ [2026-01-08 19:20:04 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.27 -> 0.25 GB (-0.02 GB)
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260108_195354.txt ADDED
@@ -0,0 +1,680 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-08 19:53:54 root] (args_utils.py 159): INFO Arguments:
2
+ [2026-01-08 19:53:54 root] (args_utils.py 160): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': True,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': 128,
57
+ 'warmup': False}
58
+ [2026-01-08 19:53:54 root] (args_utils.py 161): INFO ------------------------------------------------------------
59
+ [2026-01-08 19:53:55 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-08 19:54:11 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-08 19:54:16 root] (main.py 29): INFO Finished applying FlatQuant to model.
62
+ [2026-01-08 19:54:19 root] (train_utils.py 108): INFO ========= Layer 0 =========
63
+ [2026-01-08 19:54:27 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 5.696289s, mse: 0.01574295
64
+ [2026-01-08 19:54:31 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.970096s, mse: 0.01115426
65
+ [2026-01-08 19:54:35 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.904277s, mse: 0.00938093
66
+ [2026-01-08 19:54:39 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.879331s, mse: 0.00881439
67
+ [2026-01-08 19:54:43 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.879196s, mse: 0.00857142
68
+ [2026-01-08 19:54:47 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.875969s, mse: 0.00849318
69
+ [2026-01-08 19:54:51 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.881118s, mse: 0.00832680
70
+ [2026-01-08 19:54:54 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.879780s, mse: 0.00828776
71
+ [2026-01-08 19:54:58 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 3.879514s, mse: 0.00818714
72
+ [2026-01-08 19:55:02 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.959016s, mse: 0.00813103
73
+ [2026-01-08 19:55:06 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 4.002184s, mse: 0.00808381
74
+ [2026-01-08 19:55:10 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.974104s, mse: 0.00804329
75
+ [2026-01-08 19:55:14 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.941929s, mse: 0.00799941
76
+ [2026-01-08 19:55:18 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.913412s, mse: 0.00795571
77
+ [2026-01-08 19:55:22 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.888769s, mse: 0.00794016
78
+ [2026-01-08 19:55:22 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
79
+ [2026-01-08 19:55:23 root] (train_utils.py 108): INFO ========= Layer 1 =========
80
+ [2026-01-08 19:55:30 root] (train_utils.py 185): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 4.934166s, mse: 0.00892038
81
+ [2026-01-08 19:55:34 root] (train_utils.py 185): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 3.895962s, mse: 0.00479663
82
+ [2026-01-08 19:55:38 root] (train_utils.py 185): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.882093s, mse: 0.00384854
83
+ [2026-01-08 19:55:42 root] (train_utils.py 185): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 3.879302s, mse: 0.00355465
84
+ [2026-01-08 19:55:46 root] (train_utils.py 185): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 3.870608s, mse: 0.00343135
85
+ [2026-01-08 19:55:50 root] (train_utils.py 185): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.872881s, mse: 0.00337971
86
+ [2026-01-08 19:55:54 root] (train_utils.py 185): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.874244s, mse: 0.00336636
87
+ [2026-01-08 19:55:58 root] (train_utils.py 185): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 3.877368s, mse: 0.00329515
88
+ [2026-01-08 19:56:01 root] (train_utils.py 185): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 3.871886s, mse: 0.00326379
89
+ [2026-01-08 19:56:05 root] (train_utils.py 185): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 3.888601s, mse: 0.00321724
90
+ [2026-01-08 19:56:09 root] (train_utils.py 185): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 3.873849s, mse: 0.00316591
91
+ [2026-01-08 19:56:13 root] (train_utils.py 185): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 3.885014s, mse: 0.00313276
92
+ [2026-01-08 19:56:17 root] (train_utils.py 185): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 3.890614s, mse: 0.00310469
93
+ [2026-01-08 19:56:21 root] (train_utils.py 185): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.876605s, mse: 0.00308243
94
+ [2026-01-08 19:56:25 root] (train_utils.py 185): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 3.877022s, mse: 0.00306749
95
+ [2026-01-08 19:56:25 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
96
+ [2026-01-08 19:56:26 root] (train_utils.py 108): INFO ========= Layer 2 =========
97
+ [2026-01-08 19:56:34 root] (train_utils.py 185): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 5.188274s, mse: 0.01750460
98
+ [2026-01-08 19:56:38 root] (train_utils.py 185): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 3.876876s, mse: 0.00626545
99
+ [2026-01-08 19:56:42 root] (train_utils.py 185): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 3.873263s, mse: 0.00494380
100
+ [2026-01-08 19:56:46 root] (train_utils.py 185): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 3.874986s, mse: 0.00453308
101
+ [2026-01-08 19:56:49 root] (train_utils.py 185): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 3.879791s, mse: 0.00439964
102
+ [2026-01-08 19:56:53 root] (train_utils.py 185): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 3.872402s, mse: 0.00429795
103
+ [2026-01-08 19:56:57 root] (train_utils.py 185): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 3.880752s, mse: 0.00425246
104
+ [2026-01-08 19:57:01 root] (train_utils.py 185): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 3.871940s, mse: 0.00420888
105
+ [2026-01-08 19:57:05 root] (train_utils.py 185): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 3.878927s, mse: 0.00415287
106
+ [2026-01-08 19:57:09 root] (train_utils.py 185): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.881111s, mse: 0.00411024
107
+ [2026-01-08 19:57:13 root] (train_utils.py 185): INFO layer 2 lwc lac iter 10, lr 0.00083135 time 3.875601s, mse: 0.00407672
108
+ [2026-01-08 19:57:17 root] (train_utils.py 185): INFO layer 2 lwc lac iter 11, lr 0.00048198 time 3.878016s, mse: 0.00404750
109
+ [2026-01-08 19:57:20 root] (train_utils.py 185): INFO layer 2 lwc lac iter 12, lr 0.00022092 time 3.881199s, mse: 0.00401742
110
+ [2026-01-08 19:57:24 root] (train_utils.py 185): INFO layer 2 lwc lac iter 13, lr 0.00005958 time 3.882928s, mse: 0.00398090
111
+ [2026-01-08 19:57:28 root] (train_utils.py 185): INFO layer 2 lwc lac iter 14, lr 0.00000500 time 3.878617s, mse: 0.00397130
112
+ [2026-01-08 19:57:29 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
113
+ [2026-01-08 19:57:29 root] (train_utils.py 108): INFO ========= Layer 3 =========
114
+ [2026-01-08 19:57:37 root] (train_utils.py 185): INFO layer 3 lwc lac iter 0, lr 0.00494542 time 4.904515s, mse: 0.02308414
115
+ [2026-01-08 19:57:41 root] (train_utils.py 185): INFO layer 3 lwc lac iter 1, lr 0.00478408 time 3.959069s, mse: 0.01333557
116
+ [2026-01-08 19:57:45 root] (train_utils.py 185): INFO layer 3 lwc lac iter 2, lr 0.00452302 time 3.885673s, mse: 0.01099337
117
+ [2026-01-08 19:57:49 root] (train_utils.py 185): INFO layer 3 lwc lac iter 3, lr 0.00417365 time 3.889621s, mse: 0.01028412
118
+ [2026-01-08 19:57:52 root] (train_utils.py 185): INFO layer 3 lwc lac iter 4, lr 0.00375125 time 3.866672s, mse: 0.01000082
119
+ [2026-01-08 19:57:56 root] (train_utils.py 185): INFO layer 3 lwc lac iter 5, lr 0.00327427 time 3.871443s, mse: 0.00980410
120
+ [2026-01-08 19:58:00 root] (train_utils.py 185): INFO layer 3 lwc lac iter 6, lr 0.00276356 time 3.865910s, mse: 0.00969286
121
+ [2026-01-08 19:58:04 root] (train_utils.py 185): INFO layer 3 lwc lac iter 7, lr 0.00224144 time 3.871522s, mse: 0.00956387
122
+ [2026-01-08 19:58:08 root] (train_utils.py 185): INFO layer 3 lwc lac iter 8, lr 0.00173073 time 3.869807s, mse: 0.00946260
123
+ [2026-01-08 19:58:12 root] (train_utils.py 185): INFO layer 3 lwc lac iter 9, lr 0.00125375 time 3.870852s, mse: 0.00937346
124
+ [2026-01-08 19:58:16 root] (train_utils.py 185): INFO layer 3 lwc lac iter 10, lr 0.00083135 time 3.881036s, mse: 0.00926330
125
+ [2026-01-08 19:58:20 root] (train_utils.py 185): INFO layer 3 lwc lac iter 11, lr 0.00048198 time 3.865621s, mse: 0.00916464
126
+ [2026-01-08 19:58:23 root] (train_utils.py 185): INFO layer 3 lwc lac iter 12, lr 0.00022092 time 3.870549s, mse: 0.00907166
127
+ [2026-01-08 19:58:27 root] (train_utils.py 185): INFO layer 3 lwc lac iter 13, lr 0.00005958 time 3.869347s, mse: 0.00904066
128
+ [2026-01-08 19:58:31 root] (train_utils.py 185): INFO layer 3 lwc lac iter 14, lr 0.00000500 time 3.869239s, mse: 0.00900416
129
+ [2026-01-08 19:58:32 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
130
+ [2026-01-08 19:58:33 root] (train_utils.py 108): INFO ========= Layer 4 =========
131
+ [2026-01-08 19:58:41 root] (train_utils.py 185): INFO layer 4 lwc lac iter 0, lr 0.00494542 time 5.550916s, mse: 0.06576648
132
+ [2026-01-08 19:58:45 root] (train_utils.py 185): INFO layer 4 lwc lac iter 1, lr 0.00478408 time 3.959090s, mse: 0.03741666
133
+ [2026-01-08 19:58:49 root] (train_utils.py 185): INFO layer 4 lwc lac iter 2, lr 0.00452302 time 3.878123s, mse: 0.03053248
134
+ [2026-01-08 19:58:53 root] (train_utils.py 185): INFO layer 4 lwc lac iter 3, lr 0.00417365 time 3.880192s, mse: 0.02855516
135
+ [2026-01-08 19:58:57 root] (train_utils.py 185): INFO layer 4 lwc lac iter 4, lr 0.00375125 time 3.880899s, mse: 0.02790034
136
+ [2026-01-08 19:59:00 root] (train_utils.py 185): INFO layer 4 lwc lac iter 5, lr 0.00327427 time 3.881101s, mse: 0.02746365
137
+ [2026-01-08 19:59:04 root] (train_utils.py 185): INFO layer 4 lwc lac iter 6, lr 0.00276356 time 3.882933s, mse: 0.02716962
138
+ [2026-01-08 19:59:08 root] (train_utils.py 185): INFO layer 4 lwc lac iter 7, lr 0.00224144 time 3.879195s, mse: 0.02687641
139
+ [2026-01-08 19:59:12 root] (train_utils.py 185): INFO layer 4 lwc lac iter 8, lr 0.00173073 time 3.876591s, mse: 0.02662238
140
+ [2026-01-08 19:59:16 root] (train_utils.py 185): INFO layer 4 lwc lac iter 9, lr 0.00125375 time 3.891409s, mse: 0.02643147
141
+ [2026-01-08 19:59:20 root] (train_utils.py 185): INFO layer 4 lwc lac iter 10, lr 0.00083135 time 3.891485s, mse: 0.02624781
142
+ [2026-01-08 19:59:24 root] (train_utils.py 185): INFO layer 4 lwc lac iter 11, lr 0.00048198 time 3.898774s, mse: 0.02604026
143
+ [2026-01-08 19:59:28 root] (train_utils.py 185): INFO layer 4 lwc lac iter 12, lr 0.00022092 time 3.888591s, mse: 0.02585863
144
+ [2026-01-08 19:59:32 root] (train_utils.py 185): INFO layer 4 lwc lac iter 13, lr 0.00005958 time 3.887119s, mse: 0.02578292
145
+ [2026-01-08 19:59:35 root] (train_utils.py 185): INFO layer 4 lwc lac iter 14, lr 0.00000500 time 3.895279s, mse: 0.02572995
146
+ [2026-01-08 19:59:36 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
147
+ [2026-01-08 19:59:37 root] (train_utils.py 108): INFO ========= Layer 5 =========
148
+ [2026-01-08 19:59:45 root] (train_utils.py 185): INFO layer 5 lwc lac iter 0, lr 0.00494542 time 5.898441s, mse: 0.13743916
149
+ [2026-01-08 19:59:49 root] (train_utils.py 185): INFO layer 5 lwc lac iter 1, lr 0.00478408 time 3.961223s, mse: 0.08057592
150
+ [2026-01-08 19:59:53 root] (train_utils.py 185): INFO layer 5 lwc lac iter 2, lr 0.00452302 time 3.889991s, mse: 0.06617787
151
+ [2026-01-08 19:59:57 root] (train_utils.py 185): INFO layer 5 lwc lac iter 3, lr 0.00417365 time 3.895876s, mse: 0.06287611
152
+ [2026-01-08 20:00:01 root] (train_utils.py 185): INFO layer 5 lwc lac iter 4, lr 0.00375125 time 3.896747s, mse: 0.06213523
153
+ [2026-01-08 20:00:05 root] (train_utils.py 185): INFO layer 5 lwc lac iter 5, lr 0.00327427 time 3.888745s, mse: 0.06160403
154
+ [2026-01-08 20:00:09 root] (train_utils.py 185): INFO layer 5 lwc lac iter 6, lr 0.00276356 time 3.896078s, mse: 0.06119698
155
+ [2026-01-08 20:00:13 root] (train_utils.py 185): INFO layer 5 lwc lac iter 7, lr 0.00224144 time 3.890085s, mse: 0.06094177
156
+ [2026-01-08 20:00:16 root] (train_utils.py 185): INFO layer 5 lwc lac iter 8, lr 0.00173073 time 3.884274s, mse: 0.06060794
157
+ [2026-01-08 20:00:20 root] (train_utils.py 185): INFO layer 5 lwc lac iter 9, lr 0.00125375 time 3.900461s, mse: 0.06020888
158
+ [2026-01-08 20:00:24 root] (train_utils.py 185): INFO layer 5 lwc lac iter 10, lr 0.00083135 time 3.890183s, mse: 0.05995716
159
+ [2026-01-08 20:00:28 root] (train_utils.py 185): INFO layer 5 lwc lac iter 11, lr 0.00048198 time 3.900670s, mse: 0.05978661
160
+ [2026-01-08 20:00:32 root] (train_utils.py 185): INFO layer 5 lwc lac iter 12, lr 0.00022092 time 3.893111s, mse: 0.05955682
161
+ [2026-01-08 20:00:36 root] (train_utils.py 185): INFO layer 5 lwc lac iter 13, lr 0.00005958 time 3.897518s, mse: 0.05938030
162
+ [2026-01-08 20:00:40 root] (train_utils.py 185): INFO layer 5 lwc lac iter 14, lr 0.00000500 time 3.884297s, mse: 0.05934311
163
+ [2026-01-08 20:00:40 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
164
+ [2026-01-08 20:00:53 root] (train_utils.py 108): INFO ========= Layer 6 =========
165
+ [2026-01-08 20:01:02 root] (train_utils.py 185): INFO layer 6 lwc lac iter 0, lr 0.00494542 time 5.689345s, mse: 1.86451793
166
+ [2026-01-08 20:01:06 root] (train_utils.py 185): INFO layer 6 lwc lac iter 1, lr 0.00478408 time 3.927325s, mse: 0.35658583
167
+ [2026-01-08 20:01:10 root] (train_utils.py 185): INFO layer 6 lwc lac iter 2, lr 0.00452302 time 3.878829s, mse: 0.32737118
168
+ [2026-01-08 20:01:14 root] (train_utils.py 185): INFO layer 6 lwc lac iter 3, lr 0.00417365 time 3.883708s, mse: 0.28929594
169
+ [2026-01-08 20:01:18 root] (train_utils.py 185): INFO layer 6 lwc lac iter 4, lr 0.00375125 time 3.987442s, mse: 0.24128482
170
+ [2026-01-08 20:01:21 root] (train_utils.py 185): INFO layer 6 lwc lac iter 5, lr 0.00327427 time 3.916989s, mse: 0.21027605
171
+ [2026-01-08 20:01:25 root] (train_utils.py 185): INFO layer 6 lwc lac iter 6, lr 0.00276356 time 3.879291s, mse: 0.25483868
172
+ [2026-01-08 20:01:29 root] (train_utils.py 185): INFO layer 6 lwc lac iter 7, lr 0.00224144 time 3.888053s, mse: 0.23871142
173
+ [2026-01-08 20:01:33 root] (train_utils.py 185): INFO layer 6 lwc lac iter 8, lr 0.00173073 time 3.881018s, mse: 0.21885920
174
+ [2026-01-08 20:01:37 root] (train_utils.py 185): INFO layer 6 lwc lac iter 9, lr 0.00125375 time 3.884074s, mse: 0.20672695
175
+ [2026-01-08 20:01:41 root] (train_utils.py 185): INFO layer 6 lwc lac iter 10, lr 0.00083135 time 3.881378s, mse: 0.20202750
176
+ [2026-01-08 20:01:45 root] (train_utils.py 185): INFO layer 6 lwc lac iter 11, lr 0.00048198 time 3.877917s, mse: 0.17932597
177
+ [2026-01-08 20:01:49 root] (train_utils.py 185): INFO layer 6 lwc lac iter 12, lr 0.00022092 time 3.878248s, mse: 0.20257902
178
+ [2026-01-08 20:01:53 root] (train_utils.py 185): INFO layer 6 lwc lac iter 13, lr 0.00005958 time 3.896357s, mse: 0.20667967
179
+ [2026-01-08 20:01:56 root] (train_utils.py 185): INFO layer 6 lwc lac iter 14, lr 0.00000500 time 3.885994s, mse: 0.16777667
180
+ [2026-01-08 20:01:57 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
181
+ [2026-01-08 20:01:57 root] (train_utils.py 108): INFO ========= Layer 7 =========
182
+ [2026-01-08 20:02:05 root] (train_utils.py 185): INFO layer 7 lwc lac iter 0, lr 0.00494542 time 5.306338s, mse: 0.23462753
183
+ [2026-01-08 20:02:09 root] (train_utils.py 185): INFO layer 7 lwc lac iter 1, lr 0.00478408 time 3.972694s, mse: 0.14976017
184
+ [2026-01-08 20:02:13 root] (train_utils.py 185): INFO layer 7 lwc lac iter 2, lr 0.00452302 time 3.913413s, mse: 0.12312289
185
+ [2026-01-08 20:02:17 root] (train_utils.py 185): INFO layer 7 lwc lac iter 3, lr 0.00417365 time 3.904199s, mse: 0.11779824
186
+ [2026-01-08 20:02:21 root] (train_utils.py 185): INFO layer 7 lwc lac iter 4, lr 0.00375125 time 3.902115s, mse: 0.11621600
187
+ [2026-01-08 20:02:25 root] (train_utils.py 185): INFO layer 7 lwc lac iter 5, lr 0.00327427 time 3.879436s, mse: 0.11538153
188
+ [2026-01-08 20:02:29 root] (train_utils.py 185): INFO layer 7 lwc lac iter 6, lr 0.00276356 time 3.873058s, mse: 0.11461711
189
+ [2026-01-08 20:02:33 root] (train_utils.py 185): INFO layer 7 lwc lac iter 7, lr 0.00224144 time 3.880082s, mse: 0.11396322
190
+ [2026-01-08 20:02:36 root] (train_utils.py 185): INFO layer 7 lwc lac iter 8, lr 0.00173073 time 3.880680s, mse: 0.11346199
191
+ [2026-01-08 20:02:40 root] (train_utils.py 185): INFO layer 7 lwc lac iter 9, lr 0.00125375 time 3.874789s, mse: 0.11303829
192
+ [2026-01-08 20:02:44 root] (train_utils.py 185): INFO layer 7 lwc lac iter 10, lr 0.00083135 time 3.937302s, mse: 0.11244514
193
+ [2026-01-08 20:02:48 root] (train_utils.py 185): INFO layer 7 lwc lac iter 11, lr 0.00048198 time 3.881629s, mse: 0.11193727
194
+ [2026-01-08 20:02:52 root] (train_utils.py 185): INFO layer 7 lwc lac iter 12, lr 0.00022092 time 3.877626s, mse: 0.11167257
195
+ [2026-01-08 20:02:56 root] (train_utils.py 185): INFO layer 7 lwc lac iter 13, lr 0.00005958 time 3.881678s, mse: 0.11139309
196
+ [2026-01-08 20:03:00 root] (train_utils.py 185): INFO layer 7 lwc lac iter 14, lr 0.00000500 time 3.873263s, mse: 0.11127126
197
+ [2026-01-08 20:03:00 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
198
+ [2026-01-08 20:03:01 root] (train_utils.py 108): INFO ========= Layer 8 =========
199
+ [2026-01-08 20:03:09 root] (train_utils.py 185): INFO layer 8 lwc lac iter 0, lr 0.00494542 time 5.057469s, mse: 0.31783378
200
+ [2026-01-08 20:03:13 root] (train_utils.py 185): INFO layer 8 lwc lac iter 1, lr 0.00478408 time 3.876861s, mse: 0.21154313
201
+ [2026-01-08 20:03:16 root] (train_utils.py 185): INFO layer 8 lwc lac iter 2, lr 0.00452302 time 3.878272s, mse: 0.17556834
202
+ [2026-01-08 20:03:20 root] (train_utils.py 185): INFO layer 8 lwc lac iter 3, lr 0.00417365 time 3.874820s, mse: 0.16892871
203
+ [2026-01-08 20:03:24 root] (train_utils.py 185): INFO layer 8 lwc lac iter 4, lr 0.00375125 time 3.878854s, mse: 0.16700211
204
+ [2026-01-08 20:03:28 root] (train_utils.py 185): INFO layer 8 lwc lac iter 5, lr 0.00327427 time 3.878486s, mse: 0.16594610
205
+ [2026-01-08 20:03:32 root] (train_utils.py 185): INFO layer 8 lwc lac iter 6, lr 0.00276356 time 3.873478s, mse: 0.16510613
206
+ [2026-01-08 20:03:36 root] (train_utils.py 185): INFO layer 8 lwc lac iter 7, lr 0.00224144 time 3.872782s, mse: 0.16456470
207
+ [2026-01-08 20:03:40 root] (train_utils.py 185): INFO layer 8 lwc lac iter 8, lr 0.00173073 time 3.875328s, mse: 0.16401851
208
+ [2026-01-08 20:03:44 root] (train_utils.py 185): INFO layer 8 lwc lac iter 9, lr 0.00125375 time 3.876778s, mse: 0.16352586
209
+ [2026-01-08 20:03:47 root] (train_utils.py 185): INFO layer 8 lwc lac iter 10, lr 0.00083135 time 3.876492s, mse: 0.16331530
210
+ [2026-01-08 20:03:51 root] (train_utils.py 185): INFO layer 8 lwc lac iter 11, lr 0.00048198 time 3.863941s, mse: 0.16285881
211
+ [2026-01-08 20:03:55 root] (train_utils.py 185): INFO layer 8 lwc lac iter 12, lr 0.00022092 time 3.864256s, mse: 0.16254890
212
+ [2026-01-08 20:03:59 root] (train_utils.py 185): INFO layer 8 lwc lac iter 13, lr 0.00005958 time 3.870399s, mse: 0.16240378
213
+ [2026-01-08 20:04:03 root] (train_utils.py 185): INFO layer 8 lwc lac iter 14, lr 0.00000500 time 3.871459s, mse: 0.16246043
214
+ [2026-01-08 20:04:03 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
215
+ [2026-01-08 20:04:04 root] (train_utils.py 108): INFO ========= Layer 9 =========
216
+ [2026-01-08 20:04:12 root] (train_utils.py 185): INFO layer 9 lwc lac iter 0, lr 0.00494542 time 5.024553s, mse: 0.37875688
217
+ [2026-01-08 20:04:16 root] (train_utils.py 185): INFO layer 9 lwc lac iter 1, lr 0.00478408 time 3.871795s, mse: 0.25363240
218
+ [2026-01-08 20:04:20 root] (train_utils.py 185): INFO layer 9 lwc lac iter 2, lr 0.00452302 time 3.871877s, mse: 0.21064380
219
+ [2026-01-08 20:04:23 root] (train_utils.py 185): INFO layer 9 lwc lac iter 3, lr 0.00417365 time 3.864901s, mse: 0.20179385
220
+ [2026-01-08 20:04:27 root] (train_utils.py 185): INFO layer 9 lwc lac iter 4, lr 0.00375125 time 3.868378s, mse: 0.19936548
221
+ [2026-01-08 20:04:31 root] (train_utils.py 185): INFO layer 9 lwc lac iter 5, lr 0.00327427 time 3.870681s, mse: 0.19817175
222
+ [2026-01-08 20:04:35 root] (train_utils.py 185): INFO layer 9 lwc lac iter 6, lr 0.00276356 time 3.872413s, mse: 0.19703594
223
+ [2026-01-08 20:04:39 root] (train_utils.py 185): INFO layer 9 lwc lac iter 7, lr 0.00224144 time 3.879371s, mse: 0.19626960
224
+ [2026-01-08 20:04:43 root] (train_utils.py 185): INFO layer 9 lwc lac iter 8, lr 0.00173073 time 3.875587s, mse: 0.19534998
225
+ [2026-01-08 20:04:47 root] (train_utils.py 185): INFO layer 9 lwc lac iter 9, lr 0.00125375 time 3.874845s, mse: 0.19473058
226
+ [2026-01-08 20:04:51 root] (train_utils.py 185): INFO layer 9 lwc lac iter 10, lr 0.00083135 time 3.881644s, mse: 0.19404019
227
+ [2026-01-08 20:04:54 root] (train_utils.py 185): INFO layer 9 lwc lac iter 11, lr 0.00048198 time 3.886687s, mse: 0.19356999
228
+ [2026-01-08 20:04:59 root] (train_utils.py 185): INFO layer 9 lwc lac iter 12, lr 0.00022092 time 4.398774s, mse: 0.19326007
229
+ [2026-01-08 20:05:03 root] (train_utils.py 185): INFO layer 9 lwc lac iter 13, lr 0.00005958 time 3.888181s, mse: 0.19282311
230
+ [2026-01-08 20:05:07 root] (train_utils.py 185): INFO layer 9 lwc lac iter 14, lr 0.00000500 time 3.880987s, mse: 0.19267595
231
+ [2026-01-08 20:05:07 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
232
+ [2026-01-08 20:05:08 root] (train_utils.py 108): INFO ========= Layer 10 =========
233
+ [2026-01-08 20:05:16 root] (train_utils.py 185): INFO layer 10 lwc lac iter 0, lr 0.00494542 time 5.329155s, mse: 0.44592521
234
+ [2026-01-08 20:05:20 root] (train_utils.py 185): INFO layer 10 lwc lac iter 1, lr 0.00478408 time 3.887850s, mse: 0.28058022
235
+ [2026-01-08 20:05:24 root] (train_utils.py 185): INFO layer 10 lwc lac iter 2, lr 0.00452302 time 3.882091s, mse: 0.22870731
236
+ [2026-01-08 20:05:28 root] (train_utils.py 185): INFO layer 10 lwc lac iter 3, lr 0.00417365 time 3.881134s, mse: 0.21672769
237
+ [2026-01-08 20:05:32 root] (train_utils.py 185): INFO layer 10 lwc lac iter 4, lr 0.00375125 time 3.886322s, mse: 0.21354958
238
+ [2026-01-08 20:05:36 root] (train_utils.py 185): INFO layer 10 lwc lac iter 5, lr 0.00327427 time 3.892071s, mse: 0.21149486
239
+ [2026-01-08 20:05:40 root] (train_utils.py 185): INFO layer 10 lwc lac iter 6, lr 0.00276356 time 3.897464s, mse: 0.21045262
240
+ [2026-01-08 20:05:44 root] (train_utils.py 185): INFO layer 10 lwc lac iter 7, lr 0.00224144 time 3.917081s, mse: 0.20926467
241
+ [2026-01-08 20:05:47 root] (train_utils.py 185): INFO layer 10 lwc lac iter 8, lr 0.00173073 time 3.884677s, mse: 0.20823501
242
+ [2026-01-08 20:05:51 root] (train_utils.py 185): INFO layer 10 lwc lac iter 9, lr 0.00125375 time 3.881175s, mse: 0.20746952
243
+ [2026-01-08 20:05:55 root] (train_utils.py 185): INFO layer 10 lwc lac iter 10, lr 0.00083135 time 3.880876s, mse: 0.20690618
244
+ [2026-01-08 20:05:59 root] (train_utils.py 185): INFO layer 10 lwc lac iter 11, lr 0.00048198 time 3.879183s, mse: 0.20613439
245
+ [2026-01-08 20:06:03 root] (train_utils.py 185): INFO layer 10 lwc lac iter 12, lr 0.00022092 time 3.877125s, mse: 0.20562243
246
+ [2026-01-08 20:06:07 root] (train_utils.py 185): INFO layer 10 lwc lac iter 13, lr 0.00005958 time 3.875461s, mse: 0.20517452
247
+ [2026-01-08 20:06:11 root] (train_utils.py 185): INFO layer 10 lwc lac iter 14, lr 0.00000500 time 3.886986s, mse: 0.20504668
248
+ [2026-01-08 20:06:11 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
249
+ [2026-01-08 20:06:12 root] (train_utils.py 108): INFO ========= Layer 11 =========
250
+ [2026-01-08 20:06:20 root] (train_utils.py 185): INFO layer 11 lwc lac iter 0, lr 0.00494542 time 5.241796s, mse: 0.39262417
251
+ [2026-01-08 20:06:24 root] (train_utils.py 185): INFO layer 11 lwc lac iter 1, lr 0.00478408 time 3.955906s, mse: 0.27127978
252
+ [2026-01-08 20:06:28 root] (train_utils.py 185): INFO layer 11 lwc lac iter 2, lr 0.00452302 time 3.879225s, mse: 0.22630122
253
+ [2026-01-08 20:06:31 root] (train_utils.py 185): INFO layer 11 lwc lac iter 3, lr 0.00417365 time 3.883091s, mse: 0.21789221
254
+ [2026-01-08 20:06:35 root] (train_utils.py 185): INFO layer 11 lwc lac iter 4, lr 0.00375125 time 3.883498s, mse: 0.21573043
255
+ [2026-01-08 20:06:39 root] (train_utils.py 185): INFO layer 11 lwc lac iter 5, lr 0.00327427 time 3.888780s, mse: 0.21401882
256
+ [2026-01-08 20:06:43 root] (train_utils.py 185): INFO layer 11 lwc lac iter 6, lr 0.00276356 time 3.883860s, mse: 0.21313243
257
+ [2026-01-08 20:06:47 root] (train_utils.py 185): INFO layer 11 lwc lac iter 7, lr 0.00224144 time 3.908260s, mse: 0.21215978
258
+ [2026-01-08 20:06:51 root] (train_utils.py 185): INFO layer 11 lwc lac iter 8, lr 0.00173073 time 3.875031s, mse: 0.21121168
259
+ [2026-01-08 20:06:55 root] (train_utils.py 185): INFO layer 11 lwc lac iter 9, lr 0.00125375 time 3.878481s, mse: 0.21032479
260
+ [2026-01-08 20:06:59 root] (train_utils.py 185): INFO layer 11 lwc lac iter 10, lr 0.00083135 time 3.885566s, mse: 0.20987187
261
+ [2026-01-08 20:07:03 root] (train_utils.py 185): INFO layer 11 lwc lac iter 11, lr 0.00048198 time 3.890397s, mse: 0.20908046
262
+ [2026-01-08 20:07:06 root] (train_utils.py 185): INFO layer 11 lwc lac iter 12, lr 0.00022092 time 3.880580s, mse: 0.20848191
263
+ [2026-01-08 20:07:10 root] (train_utils.py 185): INFO layer 11 lwc lac iter 13, lr 0.00005958 time 3.883235s, mse: 0.20800886
264
+ [2026-01-08 20:07:14 root] (train_utils.py 185): INFO layer 11 lwc lac iter 14, lr 0.00000500 time 3.877982s, mse: 0.20795538
265
+ [2026-01-08 20:07:15 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
266
+ [2026-01-08 20:07:15 root] (train_utils.py 108): INFO ========= Layer 12 =========
267
+ [2026-01-08 20:07:23 root] (train_utils.py 185): INFO layer 12 lwc lac iter 0, lr 0.00494542 time 5.431174s, mse: 0.43535280
268
+ [2026-01-08 20:07:27 root] (train_utils.py 185): INFO layer 12 lwc lac iter 1, lr 0.00478408 time 3.955285s, mse: 0.29579335
269
+ [2026-01-08 20:07:31 root] (train_utils.py 185): INFO layer 12 lwc lac iter 2, lr 0.00452302 time 3.883549s, mse: 0.24488190
270
+ [2026-01-08 20:07:35 root] (train_utils.py 185): INFO layer 12 lwc lac iter 3, lr 0.00417365 time 3.886171s, mse: 0.23438135
271
+ [2026-01-08 20:07:39 root] (train_utils.py 185): INFO layer 12 lwc lac iter 4, lr 0.00375125 time 3.883848s, mse: 0.23133603
272
+ [2026-01-08 20:07:42 root] (train_utils.py 185): INFO layer 12 lwc lac iter 5, lr 0.00327427 time 3.884988s, mse: 0.22933656
273
+ [2026-01-08 20:07:46 root] (train_utils.py 185): INFO layer 12 lwc lac iter 6, lr 0.00276356 time 3.878419s, mse: 0.22804067
274
+ [2026-01-08 20:07:50 root] (train_utils.py 185): INFO layer 12 lwc lac iter 7, lr 0.00224144 time 3.877913s, mse: 0.22690852
275
+ [2026-01-08 20:07:54 root] (train_utils.py 185): INFO layer 12 lwc lac iter 8, lr 0.00173073 time 3.882433s, mse: 0.22579126
276
+ [2026-01-08 20:07:58 root] (train_utils.py 185): INFO layer 12 lwc lac iter 9, lr 0.00125375 time 3.878513s, mse: 0.22475064
277
+ [2026-01-08 20:08:02 root] (train_utils.py 185): INFO layer 12 lwc lac iter 10, lr 0.00083135 time 3.881245s, mse: 0.22366890
278
+ [2026-01-08 20:08:06 root] (train_utils.py 185): INFO layer 12 lwc lac iter 11, lr 0.00048198 time 3.883585s, mse: 0.22277188
279
+ [2026-01-08 20:08:10 root] (train_utils.py 185): INFO layer 12 lwc lac iter 12, lr 0.00022092 time 3.883076s, mse: 0.22196589
280
+ [2026-01-08 20:08:13 root] (train_utils.py 185): INFO layer 12 lwc lac iter 13, lr 0.00005958 time 3.877839s, mse: 0.22144113
281
+ [2026-01-08 20:08:17 root] (train_utils.py 185): INFO layer 12 lwc lac iter 14, lr 0.00000500 time 3.886441s, mse: 0.22116731
282
+ [2026-01-08 20:08:18 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
283
+ [2026-01-08 20:08:19 root] (train_utils.py 108): INFO ========= Layer 13 =========
284
+ [2026-01-08 20:08:27 root] (train_utils.py 185): INFO layer 13 lwc lac iter 0, lr 0.00494542 time 5.372051s, mse: 0.44991863
285
+ [2026-01-08 20:08:30 root] (train_utils.py 185): INFO layer 13 lwc lac iter 1, lr 0.00478408 time 3.877190s, mse: 0.30773303
286
+ [2026-01-08 20:08:34 root] (train_utils.py 185): INFO layer 13 lwc lac iter 2, lr 0.00452302 time 3.877676s, mse: 0.25602528
287
+ [2026-01-08 20:08:38 root] (train_utils.py 185): INFO layer 13 lwc lac iter 3, lr 0.00417365 time 3.887986s, mse: 0.24593170
288
+ [2026-01-08 20:08:42 root] (train_utils.py 185): INFO layer 13 lwc lac iter 4, lr 0.00375125 time 3.881023s, mse: 0.24332635
289
+ [2026-01-08 20:08:46 root] (train_utils.py 185): INFO layer 13 lwc lac iter 5, lr 0.00327427 time 3.883145s, mse: 0.24169515
290
+ [2026-01-08 20:08:50 root] (train_utils.py 185): INFO layer 13 lwc lac iter 6, lr 0.00276356 time 3.875900s, mse: 0.24032030
291
+ [2026-01-08 20:08:54 root] (train_utils.py 185): INFO layer 13 lwc lac iter 7, lr 0.00224144 time 3.880186s, mse: 0.23895445
292
+ [2026-01-08 20:08:58 root] (train_utils.py 185): INFO layer 13 lwc lac iter 8, lr 0.00173073 time 3.879497s, mse: 0.23795472
293
+ [2026-01-08 20:09:01 root] (train_utils.py 185): INFO layer 13 lwc lac iter 9, lr 0.00125375 time 3.878269s, mse: 0.23691620
294
+ [2026-01-08 20:09:05 root] (train_utils.py 185): INFO layer 13 lwc lac iter 10, lr 0.00083135 time 3.893473s, mse: 0.23617835
295
+ [2026-01-08 20:09:09 root] (train_utils.py 185): INFO layer 13 lwc lac iter 11, lr 0.00048198 time 3.880821s, mse: 0.23538260
296
+ [2026-01-08 20:09:13 root] (train_utils.py 185): INFO layer 13 lwc lac iter 12, lr 0.00022092 time 3.883802s, mse: 0.23459788
297
+ [2026-01-08 20:09:17 root] (train_utils.py 185): INFO layer 13 lwc lac iter 13, lr 0.00005958 time 3.885744s, mse: 0.23386008
298
+ [2026-01-08 20:09:21 root] (train_utils.py 185): INFO layer 13 lwc lac iter 14, lr 0.00000500 time 3.877215s, mse: 0.23347831
299
+ [2026-01-08 20:09:21 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
300
+ [2026-01-08 20:09:22 root] (train_utils.py 108): INFO ========= Layer 14 =========
301
+ [2026-01-08 20:09:30 root] (train_utils.py 185): INFO layer 14 lwc lac iter 0, lr 0.00494542 time 4.990215s, mse: 0.48670265
302
+ [2026-01-08 20:09:33 root] (train_utils.py 185): INFO layer 14 lwc lac iter 1, lr 0.00478408 time 3.876076s, mse: 0.32924685
303
+ [2026-01-08 20:09:37 root] (train_utils.py 185): INFO layer 14 lwc lac iter 2, lr 0.00452302 time 3.887783s, mse: 0.27174610
304
+ [2026-01-08 20:09:41 root] (train_utils.py 185): INFO layer 14 lwc lac iter 3, lr 0.00417365 time 3.876861s, mse: 0.26111004
305
+ [2026-01-08 20:09:45 root] (train_utils.py 185): INFO layer 14 lwc lac iter 4, lr 0.00375125 time 3.882962s, mse: 0.25857583
306
+ [2026-01-08 20:09:49 root] (train_utils.py 185): INFO layer 14 lwc lac iter 5, lr 0.00327427 time 3.906216s, mse: 0.25724220
307
+ [2026-01-08 20:09:53 root] (train_utils.py 185): INFO layer 14 lwc lac iter 6, lr 0.00276356 time 3.876176s, mse: 0.25530052
308
+ [2026-01-08 20:09:57 root] (train_utils.py 185): INFO layer 14 lwc lac iter 7, lr 0.00224144 time 3.885303s, mse: 0.25373703
309
+ [2026-01-08 20:10:01 root] (train_utils.py 185): INFO layer 14 lwc lac iter 8, lr 0.00173073 time 3.876806s, mse: 0.25232333
310
+ [2026-01-08 20:10:04 root] (train_utils.py 185): INFO layer 14 lwc lac iter 9, lr 0.00125375 time 3.882736s, mse: 0.25103748
311
+ [2026-01-08 20:10:08 root] (train_utils.py 185): INFO layer 14 lwc lac iter 10, lr 0.00083135 time 3.881382s, mse: 0.24987648
312
+ [2026-01-08 20:10:12 root] (train_utils.py 185): INFO layer 14 lwc lac iter 11, lr 0.00048198 time 3.883449s, mse: 0.24912813
313
+ [2026-01-08 20:10:16 root] (train_utils.py 185): INFO layer 14 lwc lac iter 12, lr 0.00022092 time 3.880871s, mse: 0.24813016
314
+ [2026-01-08 20:10:20 root] (train_utils.py 185): INFO layer 14 lwc lac iter 13, lr 0.00005958 time 3.890360s, mse: 0.24762598
315
+ [2026-01-08 20:10:24 root] (train_utils.py 185): INFO layer 14 lwc lac iter 14, lr 0.00000500 time 3.888802s, mse: 0.24739194
316
+ [2026-01-08 20:10:24 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
317
+ [2026-01-08 20:10:25 root] (train_utils.py 108): INFO ========= Layer 15 =========
318
+ [2026-01-08 20:10:33 root] (train_utils.py 185): INFO layer 15 lwc lac iter 0, lr 0.00494542 time 4.986362s, mse: 0.48941827
319
+ [2026-01-08 20:10:37 root] (train_utils.py 185): INFO layer 15 lwc lac iter 1, lr 0.00478408 time 3.880157s, mse: 0.32720220
320
+ [2026-01-08 20:10:41 root] (train_utils.py 185): INFO layer 15 lwc lac iter 2, lr 0.00452302 time 3.880608s, mse: 0.26854873
321
+ [2026-01-08 20:10:44 root] (train_utils.py 185): INFO layer 15 lwc lac iter 3, lr 0.00417365 time 3.880978s, mse: 0.25705975
322
+ [2026-01-08 20:10:48 root] (train_utils.py 185): INFO layer 15 lwc lac iter 4, lr 0.00375125 time 3.878423s, mse: 0.25422159
323
+ [2026-01-08 20:10:52 root] (train_utils.py 185): INFO layer 15 lwc lac iter 5, lr 0.00327427 time 3.891015s, mse: 0.25197345
324
+ [2026-01-08 20:10:56 root] (train_utils.py 185): INFO layer 15 lwc lac iter 6, lr 0.00276356 time 3.883527s, mse: 0.25026903
325
+ [2026-01-08 20:11:00 root] (train_utils.py 185): INFO layer 15 lwc lac iter 7, lr 0.00224144 time 3.874571s, mse: 0.24867499
326
+ [2026-01-08 20:11:04 root] (train_utils.py 185): INFO layer 15 lwc lac iter 8, lr 0.00173073 time 3.883435s, mse: 0.24771519
327
+ [2026-01-08 20:11:08 root] (train_utils.py 185): INFO layer 15 lwc lac iter 9, lr 0.00125375 time 3.882031s, mse: 0.24665023
328
+ [2026-01-08 20:11:12 root] (train_utils.py 185): INFO layer 15 lwc lac iter 10, lr 0.00083135 time 3.881983s, mse: 0.24558856
329
+ [2026-01-08 20:11:15 root] (train_utils.py 185): INFO layer 15 lwc lac iter 11, lr 0.00048198 time 3.877608s, mse: 0.24435455
330
+ [2026-01-08 20:11:19 root] (train_utils.py 185): INFO layer 15 lwc lac iter 12, lr 0.00022092 time 3.889889s, mse: 0.24346027
331
+ [2026-01-08 20:11:23 root] (train_utils.py 185): INFO layer 15 lwc lac iter 13, lr 0.00005958 time 3.883180s, mse: 0.24292424
332
+ [2026-01-08 20:11:27 root] (train_utils.py 185): INFO layer 15 lwc lac iter 14, lr 0.00000500 time 3.884431s, mse: 0.24260354
333
+ [2026-01-08 20:11:28 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
334
+ [2026-01-08 20:11:28 root] (train_utils.py 108): INFO ========= Layer 16 =========
335
+ [2026-01-08 20:11:36 root] (train_utils.py 185): INFO layer 16 lwc lac iter 0, lr 0.00494542 time 4.840796s, mse: 3.09758520
336
+ [2026-01-08 20:11:39 root] (train_utils.py 185): INFO layer 16 lwc lac iter 1, lr 0.00478408 time 3.900541s, mse: 1.53681600
337
+ [2026-01-08 20:11:43 root] (train_utils.py 185): INFO layer 16 lwc lac iter 2, lr 0.00452302 time 3.880463s, mse: 1.37538433
338
+ [2026-01-08 20:11:47 root] (train_utils.py 185): INFO layer 16 lwc lac iter 3, lr 0.00417365 time 3.883470s, mse: 1.14041376
339
+ [2026-01-08 20:11:51 root] (train_utils.py 185): INFO layer 16 lwc lac iter 4, lr 0.00375125 time 3.874584s, mse: 1.13041377
340
+ [2026-01-08 20:11:55 root] (train_utils.py 185): INFO layer 16 lwc lac iter 5, lr 0.00327427 time 3.879631s, mse: 1.17505825
341
+ [2026-01-08 20:11:59 root] (train_utils.py 185): INFO layer 16 lwc lac iter 6, lr 0.00276356 time 3.878177s, mse: 1.00187659
342
+ [2026-01-08 20:12:03 root] (train_utils.py 185): INFO layer 16 lwc lac iter 7, lr 0.00224144 time 3.875633s, mse: 1.15916288
343
+ [2026-01-08 20:12:07 root] (train_utils.py 185): INFO layer 16 lwc lac iter 8, lr 0.00173073 time 3.881597s, mse: 0.93556213
344
+ [2026-01-08 20:12:11 root] (train_utils.py 185): INFO layer 16 lwc lac iter 9, lr 0.00125375 time 3.873534s, mse: 0.89307052
345
+ [2026-01-08 20:12:14 root] (train_utils.py 185): INFO layer 16 lwc lac iter 10, lr 0.00083135 time 3.875691s, mse: 1.08854449
346
+ [2026-01-08 20:12:18 root] (train_utils.py 185): INFO layer 16 lwc lac iter 11, lr 0.00048198 time 3.883201s, mse: 0.78587675
347
+ [2026-01-08 20:12:22 root] (train_utils.py 185): INFO layer 16 lwc lac iter 12, lr 0.00022092 time 3.879601s, mse: 0.77024889
348
+ [2026-01-08 20:12:26 root] (train_utils.py 185): INFO layer 16 lwc lac iter 13, lr 0.00005958 time 3.882570s, mse: 0.74143833
349
+ [2026-01-08 20:12:30 root] (train_utils.py 185): INFO layer 16 lwc lac iter 14, lr 0.00000500 time 3.885281s, mse: 0.62904388
350
+ [2026-01-08 20:12:30 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
351
+ [2026-01-08 20:12:31 root] (train_utils.py 108): INFO ========= Layer 17 =========
352
+ [2026-01-08 20:12:39 root] (train_utils.py 185): INFO layer 17 lwc lac iter 0, lr 0.00494542 time 5.544866s, mse: 0.57632238
353
+ [2026-01-08 20:12:43 root] (train_utils.py 185): INFO layer 17 lwc lac iter 1, lr 0.00478408 time 3.915652s, mse: 0.38568184
354
+ [2026-01-08 20:12:47 root] (train_utils.py 185): INFO layer 17 lwc lac iter 2, lr 0.00452302 time 3.885339s, mse: 0.30990756
355
+ [2026-01-08 20:12:51 root] (train_utils.py 185): INFO layer 17 lwc lac iter 3, lr 0.00417365 time 3.893072s, mse: 0.29348093
356
+ [2026-01-08 20:12:55 root] (train_utils.py 185): INFO layer 17 lwc lac iter 4, lr 0.00375125 time 3.887810s, mse: 0.28841209
357
+ [2026-01-08 20:12:59 root] (train_utils.py 185): INFO layer 17 lwc lac iter 5, lr 0.00327427 time 3.886663s, mse: 0.28536177
358
+ [2026-01-08 20:13:02 root] (train_utils.py 185): INFO layer 17 lwc lac iter 6, lr 0.00276356 time 3.891812s, mse: 0.28336507
359
+ [2026-01-08 20:13:06 root] (train_utils.py 185): INFO layer 17 lwc lac iter 7, lr 0.00224144 time 3.885844s, mse: 0.28023016
360
+ [2026-01-08 20:13:10 root] (train_utils.py 185): INFO layer 17 lwc lac iter 8, lr 0.00173073 time 3.883247s, mse: 0.27797151
361
+ [2026-01-08 20:13:14 root] (train_utils.py 185): INFO layer 17 lwc lac iter 9, lr 0.00125375 time 3.882213s, mse: 0.27724716
362
+ [2026-01-08 20:13:18 root] (train_utils.py 185): INFO layer 17 lwc lac iter 10, lr 0.00083135 time 3.885559s, mse: 0.27549568
363
+ [2026-01-08 20:13:22 root] (train_utils.py 185): INFO layer 17 lwc lac iter 11, lr 0.00048198 time 3.885018s, mse: 0.27411795
364
+ [2026-01-08 20:13:26 root] (train_utils.py 185): INFO layer 17 lwc lac iter 12, lr 0.00022092 time 3.889595s, mse: 0.27230272
365
+ [2026-01-08 20:13:30 root] (train_utils.py 185): INFO layer 17 lwc lac iter 13, lr 0.00005958 time 3.888286s, mse: 0.27161792
366
+ [2026-01-08 20:13:34 root] (train_utils.py 185): INFO layer 17 lwc lac iter 14, lr 0.00000500 time 3.888104s, mse: 0.27142629
367
+ [2026-01-08 20:13:34 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
368
+ [2026-01-08 20:13:34 root] (train_utils.py 108): INFO ========= Layer 18 =========
369
+ [2026-01-08 20:13:42 root] (train_utils.py 185): INFO layer 18 lwc lac iter 0, lr 0.00494542 time 4.824618s, mse: 0.68219566
370
+ [2026-01-08 20:13:46 root] (train_utils.py 185): INFO layer 18 lwc lac iter 1, lr 0.00478408 time 3.956999s, mse: 0.44933167
371
+ [2026-01-08 20:13:50 root] (train_utils.py 185): INFO layer 18 lwc lac iter 2, lr 0.00452302 time 4.207351s, mse: 0.36149144
372
+ [2026-01-08 20:13:54 root] (train_utils.py 185): INFO layer 18 lwc lac iter 3, lr 0.00417365 time 4.268250s, mse: 0.34437451
373
+ [2026-01-08 20:13:59 root] (train_utils.py 185): INFO layer 18 lwc lac iter 4, lr 0.00375125 time 4.559022s, mse: 0.33928376
374
+ [2026-01-08 20:14:02 root] (train_utils.py 185): INFO layer 18 lwc lac iter 5, lr 0.00327427 time 3.898552s, mse: 0.33628541
375
+ [2026-01-08 20:14:06 root] (train_utils.py 185): INFO layer 18 lwc lac iter 6, lr 0.00276356 time 3.896710s, mse: 0.33380261
376
+ [2026-01-08 20:14:10 root] (train_utils.py 185): INFO layer 18 lwc lac iter 7, lr 0.00224144 time 3.873617s, mse: 0.33132178
377
+ [2026-01-08 20:14:14 root] (train_utils.py 185): INFO layer 18 lwc lac iter 8, lr 0.00173073 time 3.874515s, mse: 0.32943395
378
+ [2026-01-08 20:14:18 root] (train_utils.py 185): INFO layer 18 lwc lac iter 9, lr 0.00125375 time 3.881486s, mse: 0.32786560
379
+ [2026-01-08 20:14:22 root] (train_utils.py 185): INFO layer 18 lwc lac iter 10, lr 0.00083135 time 3.874159s, mse: 0.32583937
380
+ [2026-01-08 20:14:26 root] (train_utils.py 185): INFO layer 18 lwc lac iter 11, lr 0.00048198 time 3.872985s, mse: 0.32450172
381
+ [2026-01-08 20:14:30 root] (train_utils.py 185): INFO layer 18 lwc lac iter 12, lr 0.00022092 time 3.881658s, mse: 0.32264820
382
+ [2026-01-08 20:14:34 root] (train_utils.py 185): INFO layer 18 lwc lac iter 13, lr 0.00005958 time 3.877914s, mse: 0.32187557
383
+ [2026-01-08 20:14:37 root] (train_utils.py 185): INFO layer 18 lwc lac iter 14, lr 0.00000500 time 3.878186s, mse: 0.32105669
384
+ [2026-01-08 20:14:38 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
385
+ [2026-01-08 20:14:38 root] (train_utils.py 108): INFO ========= Layer 19 =========
386
+ [2026-01-08 20:14:45 root] (train_utils.py 185): INFO layer 19 lwc lac iter 0, lr 0.00494542 time 4.583254s, mse: 0.88728219
387
+ [2026-01-08 20:14:49 root] (train_utils.py 185): INFO layer 19 lwc lac iter 1, lr 0.00478408 time 3.886281s, mse: 0.57078516
388
+ [2026-01-08 20:14:53 root] (train_utils.py 185): INFO layer 19 lwc lac iter 2, lr 0.00452302 time 3.872338s, mse: 0.45792666
389
+ [2026-01-08 20:14:57 root] (train_utils.py 185): INFO layer 19 lwc lac iter 3, lr 0.00417365 time 3.876560s, mse: 0.43537480
390
+ [2026-01-08 20:15:01 root] (train_utils.py 185): INFO layer 19 lwc lac iter 4, lr 0.00375125 time 3.874999s, mse: 0.42894897
391
+ [2026-01-08 20:15:05 root] (train_utils.py 185): INFO layer 19 lwc lac iter 5, lr 0.00327427 time 3.882475s, mse: 0.42462113
392
+ [2026-01-08 20:15:09 root] (train_utils.py 185): INFO layer 19 lwc lac iter 6, lr 0.00276356 time 3.877281s, mse: 0.42157629
393
+ [2026-01-08 20:15:13 root] (train_utils.py 185): INFO layer 19 lwc lac iter 7, lr 0.00224144 time 3.878093s, mse: 0.41864219
394
+ [2026-01-08 20:15:16 root] (train_utils.py 185): INFO layer 19 lwc lac iter 8, lr 0.00173073 time 3.882066s, mse: 0.41570342
395
+ [2026-01-08 20:15:20 root] (train_utils.py 185): INFO layer 19 lwc lac iter 9, lr 0.00125375 time 3.872604s, mse: 0.41345572
396
+ [2026-01-08 20:15:24 root] (train_utils.py 185): INFO layer 19 lwc lac iter 10, lr 0.00083135 time 3.875411s, mse: 0.41054672
397
+ [2026-01-08 20:15:28 root] (train_utils.py 185): INFO layer 19 lwc lac iter 11, lr 0.00048198 time 3.879127s, mse: 0.40846488
398
+ [2026-01-08 20:15:32 root] (train_utils.py 185): INFO layer 19 lwc lac iter 12, lr 0.00022092 time 3.876599s, mse: 0.40727249
399
+ [2026-01-08 20:15:36 root] (train_utils.py 185): INFO layer 19 lwc lac iter 13, lr 0.00005958 time 3.881322s, mse: 0.40628025
400
+ [2026-01-08 20:15:40 root] (train_utils.py 185): INFO layer 19 lwc lac iter 14, lr 0.00000500 time 3.875526s, mse: 0.40573606
401
+ [2026-01-08 20:15:40 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
402
+ [2026-01-08 20:15:41 root] (train_utils.py 108): INFO ========= Layer 20 =========
403
+ [2026-01-08 20:15:49 root] (train_utils.py 185): INFO layer 20 lwc lac iter 0, lr 0.00494542 time 4.995933s, mse: 0.88836050
404
+ [2026-01-08 20:15:53 root] (train_utils.py 185): INFO layer 20 lwc lac iter 1, lr 0.00478408 time 3.873272s, mse: 0.59483135
405
+ [2026-01-08 20:15:56 root] (train_utils.py 185): INFO layer 20 lwc lac iter 2, lr 0.00452302 time 3.875340s, mse: 0.48579982
406
+ [2026-01-08 20:16:00 root] (train_utils.py 185): INFO layer 20 lwc lac iter 3, lr 0.00417365 time 3.876288s, mse: 0.46583182
407
+ [2026-01-08 20:16:04 root] (train_utils.py 185): INFO layer 20 lwc lac iter 4, lr 0.00375125 time 3.881600s, mse: 0.46044937
408
+ [2026-01-08 20:16:08 root] (train_utils.py 185): INFO layer 20 lwc lac iter 5, lr 0.00327427 time 3.869276s, mse: 0.45749170
409
+ [2026-01-08 20:16:12 root] (train_utils.py 185): INFO layer 20 lwc lac iter 6, lr 0.00276356 time 3.882871s, mse: 0.45316568
410
+ [2026-01-08 20:16:16 root] (train_utils.py 185): INFO layer 20 lwc lac iter 7, lr 0.00224144 time 3.873422s, mse: 0.45053339
411
+ [2026-01-08 20:16:20 root] (train_utils.py 185): INFO layer 20 lwc lac iter 8, lr 0.00173073 time 3.871368s, mse: 0.44832462
412
+ [2026-01-08 20:16:24 root] (train_utils.py 185): INFO layer 20 lwc lac iter 9, lr 0.00125375 time 3.892880s, mse: 0.44616416
413
+ [2026-01-08 20:16:27 root] (train_utils.py 185): INFO layer 20 lwc lac iter 10, lr 0.00083135 time 3.879390s, mse: 0.44334349
414
+ [2026-01-08 20:16:31 root] (train_utils.py 185): INFO layer 20 lwc lac iter 11, lr 0.00048198 time 3.874708s, mse: 0.44204527
415
+ [2026-01-08 20:16:35 root] (train_utils.py 185): INFO layer 20 lwc lac iter 12, lr 0.00022092 time 3.879157s, mse: 0.43987796
416
+ [2026-01-08 20:16:39 root] (train_utils.py 185): INFO layer 20 lwc lac iter 13, lr 0.00005958 time 3.923136s, mse: 0.43863490
417
+ [2026-01-08 20:16:43 root] (train_utils.py 185): INFO layer 20 lwc lac iter 14, lr 0.00000500 time 3.887576s, mse: 0.43791217
418
+ [2026-01-08 20:16:43 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
419
+ [2026-01-08 20:16:44 root] (train_utils.py 108): INFO ========= Layer 21 =========
420
+ [2026-01-08 20:16:52 root] (train_utils.py 185): INFO layer 21 lwc lac iter 0, lr 0.00494542 time 5.168227s, mse: 1.18043423
421
+ [2026-01-08 20:16:56 root] (train_utils.py 185): INFO layer 21 lwc lac iter 1, lr 0.00478408 time 3.969011s, mse: 0.77954561
422
+ [2026-01-08 20:17:00 root] (train_utils.py 185): INFO layer 21 lwc lac iter 2, lr 0.00452302 time 3.886918s, mse: 0.64111829
423
+ [2026-01-08 20:17:04 root] (train_utils.py 185): INFO layer 21 lwc lac iter 3, lr 0.00417365 time 3.872880s, mse: 0.61397409
424
+ [2026-01-08 20:17:08 root] (train_utils.py 185): INFO layer 21 lwc lac iter 4, lr 0.00375125 time 3.885565s, mse: 0.60631013
425
+ [2026-01-08 20:17:11 root] (train_utils.py 185): INFO layer 21 lwc lac iter 5, lr 0.00327427 time 3.888389s, mse: 0.60047567
426
+ [2026-01-08 20:17:15 root] (train_utils.py 185): INFO layer 21 lwc lac iter 6, lr 0.00276356 time 3.884962s, mse: 0.59512597
427
+ [2026-01-08 20:17:19 root] (train_utils.py 185): INFO layer 21 lwc lac iter 7, lr 0.00224144 time 3.891089s, mse: 0.59215677
428
+ [2026-01-08 20:17:23 root] (train_utils.py 185): INFO layer 21 lwc lac iter 8, lr 0.00173073 time 3.884314s, mse: 0.58796024
429
+ [2026-01-08 20:17:27 root] (train_utils.py 185): INFO layer 21 lwc lac iter 9, lr 0.00125375 time 3.872022s, mse: 0.58513182
430
+ [2026-01-08 20:17:31 root] (train_utils.py 185): INFO layer 21 lwc lac iter 10, lr 0.00083135 time 3.889990s, mse: 0.58225924
431
+ [2026-01-08 20:17:35 root] (train_utils.py 185): INFO layer 21 lwc lac iter 11, lr 0.00048198 time 3.887443s, mse: 0.57988369
432
+ [2026-01-08 20:17:39 root] (train_utils.py 185): INFO layer 21 lwc lac iter 12, lr 0.00022092 time 3.889952s, mse: 0.57718277
433
+ [2026-01-08 20:17:43 root] (train_utils.py 185): INFO layer 21 lwc lac iter 13, lr 0.00005958 time 3.880646s, mse: 0.57546204
434
+ [2026-01-08 20:17:46 root] (train_utils.py 185): INFO layer 21 lwc lac iter 14, lr 0.00000500 time 3.886966s, mse: 0.57469940
435
+ [2026-01-08 20:17:47 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
436
+ [2026-01-08 20:17:48 root] (train_utils.py 108): INFO ========= Layer 22 =========
437
+ [2026-01-08 20:17:55 root] (train_utils.py 185): INFO layer 22 lwc lac iter 0, lr 0.00494542 time 5.094079s, mse: 1.88664389
438
+ [2026-01-08 20:17:59 root] (train_utils.py 185): INFO layer 22 lwc lac iter 1, lr 0.00478408 time 3.885375s, mse: 1.18959606
439
+ [2026-01-08 20:18:03 root] (train_utils.py 185): INFO layer 22 lwc lac iter 2, lr 0.00452302 time 3.886684s, mse: 0.95907360
440
+ [2026-01-08 20:18:07 root] (train_utils.py 185): INFO layer 22 lwc lac iter 3, lr 0.00417365 time 3.882739s, mse: 0.91428280
441
+ [2026-01-08 20:18:11 root] (train_utils.py 185): INFO layer 22 lwc lac iter 4, lr 0.00375125 time 3.883721s, mse: 0.90376323
442
+ [2026-01-08 20:18:15 root] (train_utils.py 185): INFO layer 22 lwc lac iter 5, lr 0.00327427 time 3.892156s, mse: 0.89363086
443
+ [2026-01-08 20:18:19 root] (train_utils.py 185): INFO layer 22 lwc lac iter 6, lr 0.00276356 time 3.886229s, mse: 0.88751125
444
+ [2026-01-08 20:18:23 root] (train_utils.py 185): INFO layer 22 lwc lac iter 7, lr 0.00224144 time 3.886792s, mse: 0.87932986
445
+ [2026-01-08 20:18:26 root] (train_utils.py 185): INFO layer 22 lwc lac iter 8, lr 0.00173073 time 3.887209s, mse: 0.87506205
446
+ [2026-01-08 20:18:30 root] (train_utils.py 185): INFO layer 22 lwc lac iter 9, lr 0.00125375 time 3.894486s, mse: 0.86960399
447
+ [2026-01-08 20:18:34 root] (train_utils.py 185): INFO layer 22 lwc lac iter 10, lr 0.00083135 time 3.895200s, mse: 0.86433518
448
+ [2026-01-08 20:18:38 root] (train_utils.py 185): INFO layer 22 lwc lac iter 11, lr 0.00048198 time 3.897787s, mse: 0.85831034
449
+ [2026-01-08 20:18:42 root] (train_utils.py 185): INFO layer 22 lwc lac iter 12, lr 0.00022092 time 3.895077s, mse: 0.85434479
450
+ [2026-01-08 20:18:46 root] (train_utils.py 185): INFO layer 22 lwc lac iter 13, lr 0.00005958 time 3.885206s, mse: 0.85274106
451
+ [2026-01-08 20:18:50 root] (train_utils.py 185): INFO layer 22 lwc lac iter 14, lr 0.00000500 time 3.900901s, mse: 0.85105854
452
+ [2026-01-08 20:18:50 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
453
+ [2026-01-08 20:18:51 root] (train_utils.py 108): INFO ========= Layer 23 =========
454
+ [2026-01-08 20:18:59 root] (train_utils.py 185): INFO layer 23 lwc lac iter 0, lr 0.00494542 time 5.249717s, mse: 2.56160784
455
+ [2026-01-08 20:19:03 root] (train_utils.py 185): INFO layer 23 lwc lac iter 1, lr 0.00478408 time 3.885572s, mse: 1.69400561
456
+ [2026-01-08 20:19:07 root] (train_utils.py 185): INFO layer 23 lwc lac iter 2, lr 0.00452302 time 3.884498s, mse: 1.40092814
457
+ [2026-01-08 20:19:11 root] (train_utils.py 185): INFO layer 23 lwc lac iter 3, lr 0.00417365 time 3.870114s, mse: 1.33960748
458
+ [2026-01-08 20:19:14 root] (train_utils.py 185): INFO layer 23 lwc lac iter 4, lr 0.00375125 time 3.876960s, mse: 1.31923652
459
+ [2026-01-08 20:19:18 root] (train_utils.py 185): INFO layer 23 lwc lac iter 5, lr 0.00327427 time 3.886075s, mse: 1.30260742
460
+ [2026-01-08 20:19:22 root] (train_utils.py 185): INFO layer 23 lwc lac iter 6, lr 0.00276356 time 3.886401s, mse: 1.29341400
461
+ [2026-01-08 20:19:26 root] (train_utils.py 185): INFO layer 23 lwc lac iter 7, lr 0.00224144 time 3.880126s, mse: 1.28473794
462
+ [2026-01-08 20:19:30 root] (train_utils.py 185): INFO layer 23 lwc lac iter 8, lr 0.00173073 time 3.874241s, mse: 1.27725101
463
+ [2026-01-08 20:19:34 root] (train_utils.py 185): INFO layer 23 lwc lac iter 9, lr 0.00125375 time 3.873134s, mse: 1.27071691
464
+ [2026-01-08 20:19:38 root] (train_utils.py 185): INFO layer 23 lwc lac iter 10, lr 0.00083135 time 3.887421s, mse: 1.26552820
465
+ [2026-01-08 20:19:42 root] (train_utils.py 185): INFO layer 23 lwc lac iter 11, lr 0.00048198 time 3.872315s, mse: 1.26018000
466
+ [2026-01-08 20:19:46 root] (train_utils.py 185): INFO layer 23 lwc lac iter 12, lr 0.00022092 time 3.885125s, mse: 1.25696874
467
+ [2026-01-08 20:19:49 root] (train_utils.py 185): INFO layer 23 lwc lac iter 13, lr 0.00005958 time 3.875618s, mse: 1.25348544
468
+ [2026-01-08 20:19:53 root] (train_utils.py 185): INFO layer 23 lwc lac iter 14, lr 0.00000500 time 3.879704s, mse: 1.25113153
469
+ [2026-01-08 20:19:54 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
470
+ [2026-01-08 20:19:55 root] (train_utils.py 108): INFO ========= Layer 24 =========
471
+ [2026-01-08 20:20:02 root] (train_utils.py 185): INFO layer 24 lwc lac iter 0, lr 0.00494542 time 5.248398s, mse: 3.33080626
472
+ [2026-01-08 20:20:06 root] (train_utils.py 185): INFO layer 24 lwc lac iter 1, lr 0.00478408 time 3.880547s, mse: 2.21739531
473
+ [2026-01-08 20:20:10 root] (train_utils.py 185): INFO layer 24 lwc lac iter 2, lr 0.00452302 time 3.877884s, mse: 1.83558488
474
+ [2026-01-08 20:20:14 root] (train_utils.py 185): INFO layer 24 lwc lac iter 3, lr 0.00417365 time 3.877035s, mse: 1.75192118
475
+ [2026-01-08 20:20:18 root] (train_utils.py 185): INFO layer 24 lwc lac iter 4, lr 0.00375125 time 3.878414s, mse: 1.73021388
476
+ [2026-01-08 20:20:22 root] (train_utils.py 185): INFO layer 24 lwc lac iter 5, lr 0.00327427 time 3.876617s, mse: 1.70965135
477
+ [2026-01-08 20:20:26 root] (train_utils.py 185): INFO layer 24 lwc lac iter 6, lr 0.00276356 time 3.885743s, mse: 1.69753647
478
+ [2026-01-08 20:20:30 root] (train_utils.py 185): INFO layer 24 lwc lac iter 7, lr 0.00224144 time 3.880893s, mse: 1.68364048
479
+ [2026-01-08 20:20:34 root] (train_utils.py 185): INFO layer 24 lwc lac iter 8, lr 0.00173073 time 3.879582s, mse: 1.67123342
480
+ [2026-01-08 20:20:37 root] (train_utils.py 185): INFO layer 24 lwc lac iter 9, lr 0.00125375 time 3.884786s, mse: 1.66224420
481
+ [2026-01-08 20:20:41 root] (train_utils.py 185): INFO layer 24 lwc lac iter 10, lr 0.00083135 time 3.881520s, mse: 1.65476453
482
+ [2026-01-08 20:20:45 root] (train_utils.py 185): INFO layer 24 lwc lac iter 11, lr 0.00048198 time 3.883099s, mse: 1.64498436
483
+ [2026-01-08 20:20:49 root] (train_utils.py 185): INFO layer 24 lwc lac iter 12, lr 0.00022092 time 4.338018s, mse: 1.63647079
484
+ [2026-01-08 20:20:54 root] (train_utils.py 185): INFO layer 24 lwc lac iter 13, lr 0.00005958 time 4.411990s, mse: 1.63291585
485
+ [2026-01-08 20:20:58 root] (train_utils.py 185): INFO layer 24 lwc lac iter 14, lr 0.00000500 time 4.227370s, mse: 1.63007939
486
+ [2026-01-08 20:20:59 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
487
+ [2026-01-08 20:21:00 root] (train_utils.py 108): INFO ========= Layer 25 =========
488
+ [2026-01-08 20:21:07 root] (train_utils.py 185): INFO layer 25 lwc lac iter 0, lr 0.00494542 time 4.700488s, mse: 3.67945337
489
+ [2026-01-08 20:21:11 root] (train_utils.py 185): INFO layer 25 lwc lac iter 1, lr 0.00478408 time 3.908360s, mse: 2.39840055
490
+ [2026-01-08 20:21:15 root] (train_utils.py 185): INFO layer 25 lwc lac iter 2, lr 0.00452302 time 3.906975s, mse: 2.00158238
491
+ [2026-01-08 20:21:19 root] (train_utils.py 185): INFO layer 25 lwc lac iter 3, lr 0.00417365 time 3.874880s, mse: 1.92655563
492
+ [2026-01-08 20:21:22 root] (train_utils.py 185): INFO layer 25 lwc lac iter 4, lr 0.00375125 time 3.876819s, mse: 1.90741169
493
+ [2026-01-08 20:21:26 root] (train_utils.py 185): INFO layer 25 lwc lac iter 5, lr 0.00327427 time 3.876056s, mse: 1.89064825
494
+ [2026-01-08 20:21:30 root] (train_utils.py 185): INFO layer 25 lwc lac iter 6, lr 0.00276356 time 3.879740s, mse: 1.88254857
495
+ [2026-01-08 20:21:34 root] (train_utils.py 185): INFO layer 25 lwc lac iter 7, lr 0.00224144 time 3.875947s, mse: 1.87189174
496
+ [2026-01-08 20:21:38 root] (train_utils.py 185): INFO layer 25 lwc lac iter 8, lr 0.00173073 time 3.891793s, mse: 1.86226833
497
+ [2026-01-08 20:21:42 root] (train_utils.py 185): INFO layer 25 lwc lac iter 9, lr 0.00125375 time 3.879055s, mse: 1.85414529
498
+ [2026-01-08 20:21:46 root] (train_utils.py 185): INFO layer 25 lwc lac iter 10, lr 0.00083135 time 3.878680s, mse: 1.84632003
499
+ [2026-01-08 20:21:50 root] (train_utils.py 185): INFO layer 25 lwc lac iter 11, lr 0.00048198 time 3.879498s, mse: 1.83962476
500
+ [2026-01-08 20:21:53 root] (train_utils.py 185): INFO layer 25 lwc lac iter 12, lr 0.00022092 time 3.880664s, mse: 1.83272731
501
+ [2026-01-08 20:21:58 root] (train_utils.py 185): INFO layer 25 lwc lac iter 13, lr 0.00005958 time 4.080587s, mse: 1.83188641
502
+ [2026-01-08 20:22:01 root] (train_utils.py 185): INFO layer 25 lwc lac iter 14, lr 0.00000500 time 3.874877s, mse: 1.82856822
503
+ [2026-01-08 20:22:02 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
504
+ [2026-01-08 20:22:03 root] (train_utils.py 108): INFO ========= Layer 26 =========
505
+ [2026-01-08 20:22:11 root] (train_utils.py 185): INFO layer 26 lwc lac iter 0, lr 0.00494542 time 5.800936s, mse: 4.35819054
506
+ [2026-01-08 20:22:15 root] (train_utils.py 185): INFO layer 26 lwc lac iter 1, lr 0.00478408 time 3.951150s, mse: 2.94494462
507
+ [2026-01-08 20:22:19 root] (train_utils.py 185): INFO layer 26 lwc lac iter 2, lr 0.00452302 time 3.874370s, mse: 2.46222878
508
+ [2026-01-08 20:22:23 root] (train_utils.py 185): INFO layer 26 lwc lac iter 3, lr 0.00417365 time 3.879827s, mse: 2.36697221
509
+ [2026-01-08 20:22:27 root] (train_utils.py 185): INFO layer 26 lwc lac iter 4, lr 0.00375125 time 3.873295s, mse: 2.34871936
510
+ [2026-01-08 20:22:31 root] (train_utils.py 185): INFO layer 26 lwc lac iter 5, lr 0.00327427 time 3.881480s, mse: 2.33013940
511
+ [2026-01-08 20:22:34 root] (train_utils.py 185): INFO layer 26 lwc lac iter 6, lr 0.00276356 time 3.875904s, mse: 2.31725478
512
+ [2026-01-08 20:22:38 root] (train_utils.py 185): INFO layer 26 lwc lac iter 7, lr 0.00224144 time 3.879355s, mse: 2.30295658
513
+ [2026-01-08 20:22:42 root] (train_utils.py 185): INFO layer 26 lwc lac iter 8, lr 0.00173073 time 3.883507s, mse: 2.29171467
514
+ [2026-01-08 20:22:46 root] (train_utils.py 185): INFO layer 26 lwc lac iter 9, lr 0.00125375 time 3.878979s, mse: 2.28112888
515
+ [2026-01-08 20:22:50 root] (train_utils.py 185): INFO layer 26 lwc lac iter 10, lr 0.00083135 time 3.874169s, mse: 2.27260423
516
+ [2026-01-08 20:22:54 root] (train_utils.py 185): INFO layer 26 lwc lac iter 11, lr 0.00048198 time 3.872427s, mse: 2.26187754
517
+ [2026-01-08 20:22:58 root] (train_utils.py 185): INFO layer 26 lwc lac iter 12, lr 0.00022092 time 3.878131s, mse: 2.25517917
518
+ [2026-01-08 20:23:02 root] (train_utils.py 185): INFO layer 26 lwc lac iter 13, lr 0.00005958 time 3.877979s, mse: 2.24800634
519
+ [2026-01-08 20:23:05 root] (train_utils.py 185): INFO layer 26 lwc lac iter 14, lr 0.00000500 time 3.879482s, mse: 2.24403787
520
+ [2026-01-08 20:23:06 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
521
+ [2026-01-08 20:23:07 root] (train_utils.py 108): INFO ========= Layer 27 =========
522
+ [2026-01-08 20:23:16 root] (train_utils.py 185): INFO layer 27 lwc lac iter 0, lr 0.00494542 time 5.418776s, mse: 5.94560862
523
+ [2026-01-08 20:23:19 root] (train_utils.py 185): INFO layer 27 lwc lac iter 1, lr 0.00478408 time 3.875828s, mse: 3.95834851
524
+ [2026-01-08 20:23:23 root] (train_utils.py 185): INFO layer 27 lwc lac iter 2, lr 0.00452302 time 3.876054s, mse: 3.32281756
525
+ [2026-01-08 20:23:27 root] (train_utils.py 185): INFO layer 27 lwc lac iter 3, lr 0.00417365 time 3.876422s, mse: 3.18086267
526
+ [2026-01-08 20:23:31 root] (train_utils.py 185): INFO layer 27 lwc lac iter 4, lr 0.00375125 time 3.874758s, mse: 3.14467168
527
+ [2026-01-08 20:23:35 root] (train_utils.py 185): INFO layer 27 lwc lac iter 5, lr 0.00327427 time 3.882139s, mse: 3.12000346
528
+ [2026-01-08 20:23:39 root] (train_utils.py 185): INFO layer 27 lwc lac iter 6, lr 0.00276356 time 3.877449s, mse: 3.09776139
529
+ [2026-01-08 20:23:43 root] (train_utils.py 185): INFO layer 27 lwc lac iter 7, lr 0.00224144 time 3.880479s, mse: 3.07834363
530
+ [2026-01-08 20:23:47 root] (train_utils.py 185): INFO layer 27 lwc lac iter 8, lr 0.00173073 time 3.899203s, mse: 3.06277657
531
+ [2026-01-08 20:23:50 root] (train_utils.py 185): INFO layer 27 lwc lac iter 9, lr 0.00125375 time 3.879466s, mse: 3.04591680
532
+ [2026-01-08 20:23:54 root] (train_utils.py 185): INFO layer 27 lwc lac iter 10, lr 0.00083135 time 3.876263s, mse: 3.03134632
533
+ [2026-01-08 20:23:58 root] (train_utils.py 185): INFO layer 27 lwc lac iter 11, lr 0.00048198 time 3.881885s, mse: 3.01916480
534
+ [2026-01-08 20:24:02 root] (train_utils.py 185): INFO layer 27 lwc lac iter 12, lr 0.00022092 time 3.881700s, mse: 3.00719571
535
+ [2026-01-08 20:24:06 root] (train_utils.py 185): INFO layer 27 lwc lac iter 13, lr 0.00005958 time 3.874153s, mse: 2.99984956
536
+ [2026-01-08 20:24:10 root] (train_utils.py 185): INFO layer 27 lwc lac iter 14, lr 0.00000500 time 3.875395s, mse: 2.99120903
537
+ [2026-01-08 20:24:10 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
538
+ [2026-01-08 20:24:11 root] (train_utils.py 108): INFO ========= Layer 28 =========
539
+ [2026-01-08 20:24:19 root] (train_utils.py 185): INFO layer 28 lwc lac iter 0, lr 0.00494542 time 4.966117s, mse: 8.40579605
540
+ [2026-01-08 20:24:23 root] (train_utils.py 185): INFO layer 28 lwc lac iter 1, lr 0.00478408 time 3.873255s, mse: 5.55529737
541
+ [2026-01-08 20:24:26 root] (train_utils.py 185): INFO layer 28 lwc lac iter 2, lr 0.00452302 time 3.885448s, mse: 4.64479589
542
+ [2026-01-08 20:24:30 root] (train_utils.py 185): INFO layer 28 lwc lac iter 3, lr 0.00417365 time 3.878376s, mse: 4.46341419
543
+ [2026-01-08 20:24:34 root] (train_utils.py 185): INFO layer 28 lwc lac iter 4, lr 0.00375125 time 3.879312s, mse: 4.40386772
544
+ [2026-01-08 20:24:38 root] (train_utils.py 185): INFO layer 28 lwc lac iter 5, lr 0.00327427 time 3.882168s, mse: 4.37245226
545
+ [2026-01-08 20:24:42 root] (train_utils.py 185): INFO layer 28 lwc lac iter 6, lr 0.00276356 time 3.881505s, mse: 4.34240580
546
+ [2026-01-08 20:24:46 root] (train_utils.py 185): INFO layer 28 lwc lac iter 7, lr 0.00224144 time 3.881425s, mse: 4.31763363
547
+ [2026-01-08 20:24:50 root] (train_utils.py 185): INFO layer 28 lwc lac iter 8, lr 0.00173073 time 3.878464s, mse: 4.29854107
548
+ [2026-01-08 20:24:54 root] (train_utils.py 185): INFO layer 28 lwc lac iter 9, lr 0.00125375 time 3.876505s, mse: 4.28071547
549
+ [2026-01-08 20:24:57 root] (train_utils.py 185): INFO layer 28 lwc lac iter 10, lr 0.00083135 time 3.881535s, mse: 4.26679897
550
+ [2026-01-08 20:25:01 root] (train_utils.py 185): INFO layer 28 lwc lac iter 11, lr 0.00048198 time 3.877203s, mse: 4.24268007
551
+ [2026-01-08 20:25:05 root] (train_utils.py 185): INFO layer 28 lwc lac iter 12, lr 0.00022092 time 3.876781s, mse: 4.22641373
552
+ [2026-01-08 20:25:09 root] (train_utils.py 185): INFO layer 28 lwc lac iter 13, lr 0.00005958 time 3.874362s, mse: 4.22128248
553
+ [2026-01-08 20:25:13 root] (train_utils.py 185): INFO layer 28 lwc lac iter 14, lr 0.00000500 time 3.876956s, mse: 4.21494389
554
+ [2026-01-08 20:25:13 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
555
+ [2026-01-08 20:25:14 root] (train_utils.py 108): INFO ========= Layer 29 =========
556
+ [2026-01-08 20:25:21 root] (train_utils.py 185): INFO layer 29 lwc lac iter 0, lr 0.00494542 time 4.975133s, mse: 10.38746834
557
+ [2026-01-08 20:25:25 root] (train_utils.py 185): INFO layer 29 lwc lac iter 1, lr 0.00478408 time 3.876715s, mse: 7.14648628
558
+ [2026-01-08 20:25:29 root] (train_utils.py 185): INFO layer 29 lwc lac iter 2, lr 0.00452302 time 3.880992s, mse: 6.03318691
559
+ [2026-01-08 20:25:33 root] (train_utils.py 185): INFO layer 29 lwc lac iter 3, lr 0.00417365 time 3.880460s, mse: 5.78764057
560
+ [2026-01-08 20:25:37 root] (train_utils.py 185): INFO layer 29 lwc lac iter 4, lr 0.00375125 time 3.886153s, mse: 5.71550655
561
+ [2026-01-08 20:25:41 root] (train_utils.py 185): INFO layer 29 lwc lac iter 5, lr 0.00327427 time 3.883043s, mse: 5.66473246
562
+ [2026-01-08 20:25:45 root] (train_utils.py 185): INFO layer 29 lwc lac iter 6, lr 0.00276356 time 3.884337s, mse: 5.61916113
563
+ [2026-01-08 20:25:49 root] (train_utils.py 185): INFO layer 29 lwc lac iter 7, lr 0.00224144 time 3.878447s, mse: 5.58458805
564
+ [2026-01-08 20:25:52 root] (train_utils.py 185): INFO layer 29 lwc lac iter 8, lr 0.00173073 time 3.873308s, mse: 5.54784393
565
+ [2026-01-08 20:25:56 root] (train_utils.py 185): INFO layer 29 lwc lac iter 9, lr 0.00125375 time 3.882202s, mse: 5.52231646
566
+ [2026-01-08 20:26:00 root] (train_utils.py 185): INFO layer 29 lwc lac iter 10, lr 0.00083135 time 3.881016s, mse: 5.48976994
567
+ [2026-01-08 20:26:04 root] (train_utils.py 185): INFO layer 29 lwc lac iter 11, lr 0.00048198 time 3.876146s, mse: 5.46507311
568
+ [2026-01-08 20:26:08 root] (train_utils.py 185): INFO layer 29 lwc lac iter 12, lr 0.00022092 time 3.877000s, mse: 5.44575977
569
+ [2026-01-08 20:26:12 root] (train_utils.py 185): INFO layer 29 lwc lac iter 13, lr 0.00005958 time 3.881991s, mse: 5.43577242
570
+ [2026-01-08 20:26:16 root] (train_utils.py 185): INFO layer 29 lwc lac iter 14, lr 0.00000500 time 3.874315s, mse: 5.42604542
571
+ [2026-01-08 20:26:16 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
572
+ [2026-01-08 20:26:17 root] (train_utils.py 108): INFO ========= Layer 30 =========
573
+ [2026-01-08 20:26:25 root] (train_utils.py 185): INFO layer 30 lwc lac iter 0, lr 0.00494542 time 5.156940s, mse: 16.29405975
574
+ [2026-01-08 20:26:29 root] (train_utils.py 185): INFO layer 30 lwc lac iter 1, lr 0.00478408 time 3.891409s, mse: 11.01632500
575
+ [2026-01-08 20:26:32 root] (train_utils.py 185): INFO layer 30 lwc lac iter 2, lr 0.00452302 time 3.874850s, mse: 9.27882481
576
+ [2026-01-08 20:26:36 root] (train_utils.py 185): INFO layer 30 lwc lac iter 3, lr 0.00417365 time 3.885643s, mse: 8.87542439
577
+ [2026-01-08 20:26:40 root] (train_utils.py 185): INFO layer 30 lwc lac iter 4, lr 0.00375125 time 3.875562s, mse: 8.75351048
578
+ [2026-01-08 20:26:44 root] (train_utils.py 185): INFO layer 30 lwc lac iter 5, lr 0.00327427 time 3.872500s, mse: 8.65880680
579
+ [2026-01-08 20:26:48 root] (train_utils.py 185): INFO layer 30 lwc lac iter 6, lr 0.00276356 time 3.884465s, mse: 8.60634327
580
+ [2026-01-08 20:26:52 root] (train_utils.py 185): INFO layer 30 lwc lac iter 7, lr 0.00224144 time 3.870189s, mse: 8.53597736
581
+ [2026-01-08 20:26:56 root] (train_utils.py 185): INFO layer 30 lwc lac iter 8, lr 0.00173073 time 3.875866s, mse: 8.50352001
582
+ [2026-01-08 20:27:00 root] (train_utils.py 185): INFO layer 30 lwc lac iter 9, lr 0.00125375 time 3.874484s, mse: 8.44190311
583
+ [2026-01-08 20:27:03 root] (train_utils.py 185): INFO layer 30 lwc lac iter 10, lr 0.00083135 time 3.877156s, mse: 8.40491486
584
+ [2026-01-08 20:27:07 root] (train_utils.py 185): INFO layer 30 lwc lac iter 11, lr 0.00048198 time 3.874284s, mse: 8.38511753
585
+ [2026-01-08 20:27:11 root] (train_utils.py 185): INFO layer 30 lwc lac iter 12, lr 0.00022092 time 3.874029s, mse: 8.35692787
586
+ [2026-01-08 20:27:15 root] (train_utils.py 185): INFO layer 30 lwc lac iter 13, lr 0.00005958 time 3.880367s, mse: 8.35674667
587
+ [2026-01-08 20:27:19 root] (train_utils.py 185): INFO layer 30 lwc lac iter 14, lr 0.00000500 time 3.877804s, mse: 8.34408569
588
+ [2026-01-08 20:27:19 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
589
+ [2026-01-08 20:27:20 root] (train_utils.py 108): INFO ========= Layer 31 =========
590
+ [2026-01-08 20:27:29 root] (train_utils.py 185): INFO layer 31 lwc lac iter 0, lr 0.00494542 time 5.676542s, mse: 20.78250885
591
+ [2026-01-08 20:27:33 root] (train_utils.py 185): INFO layer 31 lwc lac iter 1, lr 0.00478408 time 3.947701s, mse: 14.37235165
592
+ [2026-01-08 20:27:36 root] (train_utils.py 185): INFO layer 31 lwc lac iter 2, lr 0.00452302 time 3.875565s, mse: 12.13233566
593
+ [2026-01-08 20:27:40 root] (train_utils.py 185): INFO layer 31 lwc lac iter 3, lr 0.00417365 time 3.872178s, mse: 11.62570667
594
+ [2026-01-08 20:27:44 root] (train_utils.py 185): INFO layer 31 lwc lac iter 4, lr 0.00375125 time 3.876312s, mse: 11.51362991
595
+ [2026-01-08 20:27:48 root] (train_utils.py 185): INFO layer 31 lwc lac iter 5, lr 0.00327427 time 3.880129s, mse: 11.42485142
596
+ [2026-01-08 20:27:52 root] (train_utils.py 185): INFO layer 31 lwc lac iter 6, lr 0.00276356 time 3.876696s, mse: 11.33607769
597
+ [2026-01-08 20:27:56 root] (train_utils.py 185): INFO layer 31 lwc lac iter 7, lr 0.00224144 time 3.879226s, mse: 11.27843571
598
+ [2026-01-08 20:28:00 root] (train_utils.py 185): INFO layer 31 lwc lac iter 8, lr 0.00173073 time 3.872689s, mse: 11.22037888
599
+ [2026-01-08 20:28:04 root] (train_utils.py 185): INFO layer 31 lwc lac iter 9, lr 0.00125375 time 3.875676s, mse: 11.15839195
600
+ [2026-01-08 20:28:07 root] (train_utils.py 185): INFO layer 31 lwc lac iter 10, lr 0.00083135 time 3.871733s, mse: 11.12734127
601
+ [2026-01-08 20:28:11 root] (train_utils.py 185): INFO layer 31 lwc lac iter 11, lr 0.00048198 time 3.876626s, mse: 11.08810806
602
+ [2026-01-08 20:28:15 root] (train_utils.py 185): INFO layer 31 lwc lac iter 12, lr 0.00022092 time 3.878724s, mse: 11.05513668
603
+ [2026-01-08 20:28:19 root] (train_utils.py 185): INFO layer 31 lwc lac iter 13, lr 0.00005958 time 3.879174s, mse: 11.03436947
604
+ [2026-01-08 20:28:23 root] (train_utils.py 185): INFO layer 31 lwc lac iter 14, lr 0.00000500 time 3.875401s, mse: 11.01393795
605
+ [2026-01-08 20:28:23 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
606
+ [2026-01-08 20:28:24 root] (train_utils.py 108): INFO ========= Layer 32 =========
607
+ [2026-01-08 20:28:32 root] (train_utils.py 185): INFO layer 32 lwc lac iter 0, lr 0.00494542 time 5.014384s, mse: 28.37956429
608
+ [2026-01-08 20:28:36 root] (train_utils.py 185): INFO layer 32 lwc lac iter 1, lr 0.00478408 time 3.874096s, mse: 19.76789856
609
+ [2026-01-08 20:28:39 root] (train_utils.py 185): INFO layer 32 lwc lac iter 2, lr 0.00452302 time 3.879622s, mse: 16.61169624
610
+ [2026-01-08 20:28:43 root] (train_utils.py 185): INFO layer 32 lwc lac iter 3, lr 0.00417365 time 3.878350s, mse: 15.88970184
611
+ [2026-01-08 20:28:47 root] (train_utils.py 185): INFO layer 32 lwc lac iter 4, lr 0.00375125 time 3.873660s, mse: 15.74769402
612
+ [2026-01-08 20:28:51 root] (train_utils.py 185): INFO layer 32 lwc lac iter 5, lr 0.00327427 time 3.872789s, mse: 15.61922455
613
+ [2026-01-08 20:28:55 root] (train_utils.py 185): INFO layer 32 lwc lac iter 6, lr 0.00276356 time 3.873571s, mse: 15.51004982
614
+ [2026-01-08 20:28:59 root] (train_utils.py 185): INFO layer 32 lwc lac iter 7, lr 0.00224144 time 3.873817s, mse: 15.42904854
615
+ [2026-01-08 20:29:03 root] (train_utils.py 185): INFO layer 32 lwc lac iter 8, lr 0.00173073 time 3.879045s, mse: 15.34880447
616
+ [2026-01-08 20:29:07 root] (train_utils.py 185): INFO layer 32 lwc lac iter 9, lr 0.00125375 time 3.876587s, mse: 15.27359772
617
+ [2026-01-08 20:29:10 root] (train_utils.py 185): INFO layer 32 lwc lac iter 10, lr 0.00083135 time 3.876626s, mse: 15.21441174
618
+ [2026-01-08 20:29:14 root] (train_utils.py 185): INFO layer 32 lwc lac iter 11, lr 0.00048198 time 3.873835s, mse: 15.16252708
619
+ [2026-01-08 20:29:18 root] (train_utils.py 185): INFO layer 32 lwc lac iter 12, lr 0.00022092 time 3.877970s, mse: 15.10843849
620
+ [2026-01-08 20:29:22 root] (train_utils.py 185): INFO layer 32 lwc lac iter 13, lr 0.00005958 time 3.876159s, mse: 15.08382893
621
+ [2026-01-08 20:29:26 root] (train_utils.py 185): INFO layer 32 lwc lac iter 14, lr 0.00000500 time 3.866788s, mse: 15.06546974
622
+ [2026-01-08 20:29:26 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
623
+ [2026-01-08 20:29:27 root] (train_utils.py 108): INFO ========= Layer 33 =========
624
+ [2026-01-08 20:29:35 root] (train_utils.py 185): INFO layer 33 lwc lac iter 0, lr 0.00494542 time 5.076409s, mse: 41.54327011
625
+ [2026-01-08 20:29:39 root] (train_utils.py 185): INFO layer 33 lwc lac iter 1, lr 0.00478408 time 3.883298s, mse: 27.93664551
626
+ [2026-01-08 20:29:43 root] (train_utils.py 185): INFO layer 33 lwc lac iter 2, lr 0.00452302 time 3.877707s, mse: 23.32941628
627
+ [2026-01-08 20:29:47 root] (train_utils.py 185): INFO layer 33 lwc lac iter 3, lr 0.00417365 time 3.931390s, mse: 22.34293175
628
+ [2026-01-08 20:29:51 root] (train_utils.py 185): INFO layer 33 lwc lac iter 4, lr 0.00375125 time 3.876992s, mse: 22.07669640
629
+ [2026-01-08 20:29:54 root] (train_utils.py 185): INFO layer 33 lwc lac iter 5, lr 0.00327427 time 3.892163s, mse: 21.87960243
630
+ [2026-01-08 20:29:58 root] (train_utils.py 185): INFO layer 33 lwc lac iter 6, lr 0.00276356 time 3.882131s, mse: 21.73635674
631
+ [2026-01-08 20:30:02 root] (train_utils.py 185): INFO layer 33 lwc lac iter 7, lr 0.00224144 time 3.886717s, mse: 21.58724403
632
+ [2026-01-08 20:30:06 root] (train_utils.py 185): INFO layer 33 lwc lac iter 8, lr 0.00173073 time 3.884485s, mse: 21.46766853
633
+ [2026-01-08 20:30:10 root] (train_utils.py 185): INFO layer 33 lwc lac iter 9, lr 0.00125375 time 3.888596s, mse: 21.36098099
634
+ [2026-01-08 20:30:14 root] (train_utils.py 185): INFO layer 33 lwc lac iter 10, lr 0.00083135 time 3.884719s, mse: 21.27636719
635
+ [2026-01-08 20:30:18 root] (train_utils.py 185): INFO layer 33 lwc lac iter 11, lr 0.00048198 time 3.906380s, mse: 21.16030693
636
+ [2026-01-08 20:30:22 root] (train_utils.py 185): INFO layer 33 lwc lac iter 12, lr 0.00022092 time 3.878760s, mse: 21.07536125
637
+ [2026-01-08 20:30:26 root] (train_utils.py 185): INFO layer 33 lwc lac iter 13, lr 0.00005958 time 3.876303s, mse: 20.99114990
638
+ [2026-01-08 20:30:29 root] (train_utils.py 185): INFO layer 33 lwc lac iter 14, lr 0.00000500 time 3.879338s, mse: 20.95961761
639
+ [2026-01-08 20:30:30 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
640
+ [2026-01-08 20:30:31 root] (train_utils.py 108): INFO ========= Layer 34 =========
641
+ [2026-01-08 20:30:38 root] (train_utils.py 185): INFO layer 34 lwc lac iter 0, lr 0.00494542 time 5.304806s, mse: 64.93594360
642
+ [2026-01-08 20:30:42 root] (train_utils.py 185): INFO layer 34 lwc lac iter 1, lr 0.00478408 time 3.879078s, mse: 40.86461258
643
+ [2026-01-08 20:30:46 root] (train_utils.py 185): INFO layer 34 lwc lac iter 2, lr 0.00452302 time 3.877936s, mse: 33.65349960
644
+ [2026-01-08 20:30:50 root] (train_utils.py 185): INFO layer 34 lwc lac iter 3, lr 0.00417365 time 3.873914s, mse: 31.96302605
645
+ [2026-01-08 20:30:54 root] (train_utils.py 185): INFO layer 34 lwc lac iter 4, lr 0.00375125 time 3.869976s, mse: 31.66926384
646
+ [2026-01-08 20:30:58 root] (train_utils.py 185): INFO layer 34 lwc lac iter 5, lr 0.00327427 time 3.870930s, mse: 31.07656479
647
+ [2026-01-08 20:31:02 root] (train_utils.py 185): INFO layer 34 lwc lac iter 6, lr 0.00276356 time 3.873630s, mse: 30.91048813
648
+ [2026-01-08 20:31:06 root] (train_utils.py 185): INFO layer 34 lwc lac iter 7, lr 0.00224144 time 3.874344s, mse: 30.05115700
649
+ [2026-01-08 20:31:09 root] (train_utils.py 185): INFO layer 34 lwc lac iter 8, lr 0.00173073 time 3.875742s, mse: 29.89023590
650
+ [2026-01-08 20:31:13 root] (train_utils.py 185): INFO layer 34 lwc lac iter 9, lr 0.00125375 time 3.876966s, mse: 30.35319901
651
+ [2026-01-08 20:31:17 root] (train_utils.py 185): INFO layer 34 lwc lac iter 10, lr 0.00083135 time 3.879142s, mse: 29.46559715
652
+ [2026-01-08 20:31:21 root] (train_utils.py 185): INFO layer 34 lwc lac iter 11, lr 0.00048198 time 3.871687s, mse: 29.05239487
653
+ [2026-01-08 20:31:25 root] (train_utils.py 185): INFO layer 34 lwc lac iter 12, lr 0.00022092 time 3.872833s, mse: 28.86521339
654
+ [2026-01-08 20:31:29 root] (train_utils.py 185): INFO layer 34 lwc lac iter 13, lr 0.00005958 time 3.881875s, mse: 28.74409676
655
+ [2026-01-08 20:31:33 root] (train_utils.py 185): INFO layer 34 lwc lac iter 14, lr 0.00000500 time 3.879118s, mse: 28.70412636
656
+ [2026-01-08 20:31:33 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
657
+ [2026-01-08 20:31:34 root] (train_utils.py 108): INFO ========= Layer 35 =========
658
+ [2026-01-08 20:31:42 root] (train_utils.py 185): INFO layer 35 lwc lac iter 0, lr 0.00494542 time 5.142302s, mse: 108.25781250
659
+ [2026-01-08 20:31:46 root] (train_utils.py 185): INFO layer 35 lwc lac iter 1, lr 0.00478408 time 3.879463s, mse: 38.04971313
660
+ [2026-01-08 20:31:49 root] (train_utils.py 185): INFO layer 35 lwc lac iter 2, lr 0.00452302 time 3.874327s, mse: 31.63025665
661
+ [2026-01-08 20:31:53 root] (train_utils.py 185): INFO layer 35 lwc lac iter 3, lr 0.00417365 time 3.874362s, mse: 29.21376991
662
+ [2026-01-08 20:31:57 root] (train_utils.py 185): INFO layer 35 lwc lac iter 4, lr 0.00375125 time 3.884794s, mse: 28.19089508
663
+ [2026-01-08 20:32:01 root] (train_utils.py 185): INFO layer 35 lwc lac iter 5, lr 0.00327427 time 3.873233s, mse: 28.40728760
664
+ [2026-01-08 20:32:05 root] (train_utils.py 185): INFO layer 35 lwc lac iter 6, lr 0.00276356 time 3.876803s, mse: 27.74842644
665
+ [2026-01-08 20:32:09 root] (train_utils.py 185): INFO layer 35 lwc lac iter 7, lr 0.00224144 time 3.873011s, mse: 27.13273811
666
+ [2026-01-08 20:32:13 root] (train_utils.py 185): INFO layer 35 lwc lac iter 8, lr 0.00173073 time 3.873823s, mse: 26.53238487
667
+ [2026-01-08 20:32:17 root] (train_utils.py 185): INFO layer 35 lwc lac iter 9, lr 0.00125375 time 3.874454s, mse: 26.14052200
668
+ [2026-01-08 20:32:20 root] (train_utils.py 185): INFO layer 35 lwc lac iter 10, lr 0.00083135 time 3.885984s, mse: 25.63203621
669
+ [2026-01-08 20:32:24 root] (train_utils.py 185): INFO layer 35 lwc lac iter 11, lr 0.00048198 time 3.874369s, mse: 25.35079384
670
+ [2026-01-08 20:32:28 root] (train_utils.py 185): INFO layer 35 lwc lac iter 12, lr 0.00022092 time 3.883839s, mse: 25.21109390
671
+ [2026-01-08 20:32:32 root] (train_utils.py 185): INFO layer 35 lwc lac iter 13, lr 0.00005958 time 3.875901s, mse: 24.95710945
672
+ [2026-01-08 20:32:36 root] (train_utils.py 185): INFO layer 35 lwc lac iter 14, lr 0.00000500 time 3.888915s, mse: 24.85692596
673
+ [2026-01-08 20:32:36 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
674
+ [2026-01-08 20:33:19 root] (main.py 39): INFO Finished reparameterize model.
675
+ [2026-01-08 20:33:46 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.27 -> 0.25 GB (-0.02 GB)
676
+ [2026-01-08 20:34:11 root] (flat_utils.py 204): INFO saved weights at ./outputs/Qwen3-8B/w4a4/exp
677
+ [2026-01-08 20:34:24 root] (main.py 60): INFO wikitext2
678
+ [2026-01-08 20:35:05 root] (main.py 69): INFO 10.271322250366211
679
+ [2026-01-08 20:35:05 root] (main.py 60): INFO c4
680
+ [2026-01-08 20:36:14 root] (main.py 69): INFO 16.169748306274414
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260109_092702.txt ADDED
@@ -0,0 +1,680 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-09 09:27:02 root] (args_utils.py 168): INFO Arguments:
2
+ [2026-01-09 09:27:02 root] (args_utils.py 169): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': True,
40
+ 'reload_matrix': False,
41
+ 'resume': False,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': 128,
57
+ 'warmup': False}
58
+ [2026-01-09 09:27:02 root] (args_utils.py 170): INFO ------------------------------------------------------------
59
+ [2026-01-09 09:27:03 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-09 09:27:19 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-09 09:27:24 root] (main.py 29): INFO Finished applying FlatQuant to model.
62
+ [2026-01-09 09:27:26 root] (train_utils.py 108): INFO ========= Layer 0 =========
63
+ [2026-01-09 09:27:34 root] (train_utils.py 185): INFO layer 0 lwc lac iter 0, lr 0.00494542 time 5.354077s, mse: 0.01574295
64
+ [2026-01-09 09:27:38 root] (train_utils.py 185): INFO layer 0 lwc lac iter 1, lr 0.00478408 time 3.918327s, mse: 0.01115426
65
+ [2026-01-09 09:27:42 root] (train_utils.py 185): INFO layer 0 lwc lac iter 2, lr 0.00452302 time 3.884766s, mse: 0.00938093
66
+ [2026-01-09 09:27:46 root] (train_utils.py 185): INFO layer 0 lwc lac iter 3, lr 0.00417365 time 3.885943s, mse: 0.00881439
67
+ [2026-01-09 09:27:50 root] (train_utils.py 185): INFO layer 0 lwc lac iter 4, lr 0.00375125 time 3.880416s, mse: 0.00857142
68
+ [2026-01-09 09:27:53 root] (train_utils.py 185): INFO layer 0 lwc lac iter 5, lr 0.00327427 time 3.886740s, mse: 0.00849318
69
+ [2026-01-09 09:27:57 root] (train_utils.py 185): INFO layer 0 lwc lac iter 6, lr 0.00276356 time 3.884636s, mse: 0.00832680
70
+ [2026-01-09 09:28:01 root] (train_utils.py 185): INFO layer 0 lwc lac iter 7, lr 0.00224144 time 3.883160s, mse: 0.00828776
71
+ [2026-01-09 09:28:05 root] (train_utils.py 185): INFO layer 0 lwc lac iter 8, lr 0.00173073 time 3.903839s, mse: 0.00818714
72
+ [2026-01-09 09:28:09 root] (train_utils.py 185): INFO layer 0 lwc lac iter 9, lr 0.00125375 time 3.949619s, mse: 0.00813103
73
+ [2026-01-09 09:28:13 root] (train_utils.py 185): INFO layer 0 lwc lac iter 10, lr 0.00083135 time 3.970162s, mse: 0.00808381
74
+ [2026-01-09 09:28:17 root] (train_utils.py 185): INFO layer 0 lwc lac iter 11, lr 0.00048198 time 3.953839s, mse: 0.00804329
75
+ [2026-01-09 09:28:21 root] (train_utils.py 185): INFO layer 0 lwc lac iter 12, lr 0.00022092 time 3.935800s, mse: 0.00799941
76
+ [2026-01-09 09:28:25 root] (train_utils.py 185): INFO layer 0 lwc lac iter 13, lr 0.00005958 time 3.936768s, mse: 0.00795571
77
+ [2026-01-09 09:28:29 root] (train_utils.py 185): INFO layer 0 lwc lac iter 14, lr 0.00000500 time 3.936058s, mse: 0.00794016
78
+ [2026-01-09 09:28:29 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
79
+ [2026-01-09 09:28:30 root] (train_utils.py 108): INFO ========= Layer 1 =========
80
+ [2026-01-09 09:28:37 root] (train_utils.py 185): INFO layer 1 lwc lac iter 0, lr 0.00494542 time 4.692473s, mse: 0.00892038
81
+ [2026-01-09 09:28:41 root] (train_utils.py 185): INFO layer 1 lwc lac iter 1, lr 0.00478408 time 3.960003s, mse: 0.00479663
82
+ [2026-01-09 09:28:45 root] (train_utils.py 185): INFO layer 1 lwc lac iter 2, lr 0.00452302 time 3.980257s, mse: 0.00384854
83
+ [2026-01-09 09:28:49 root] (train_utils.py 185): INFO layer 1 lwc lac iter 3, lr 0.00417365 time 3.956233s, mse: 0.00355465
84
+ [2026-01-09 09:28:53 root] (train_utils.py 185): INFO layer 1 lwc lac iter 4, lr 0.00375125 time 3.955152s, mse: 0.00343135
85
+ [2026-01-09 09:28:57 root] (train_utils.py 185): INFO layer 1 lwc lac iter 5, lr 0.00327427 time 3.946210s, mse: 0.00337971
86
+ [2026-01-09 09:29:01 root] (train_utils.py 185): INFO layer 1 lwc lac iter 6, lr 0.00276356 time 3.939211s, mse: 0.00336636
87
+ [2026-01-09 09:29:05 root] (train_utils.py 185): INFO layer 1 lwc lac iter 7, lr 0.00224144 time 4.178168s, mse: 0.00329515
88
+ [2026-01-09 09:29:09 root] (train_utils.py 185): INFO layer 1 lwc lac iter 8, lr 0.00173073 time 4.400898s, mse: 0.00326379
89
+ [2026-01-09 09:29:14 root] (train_utils.py 185): INFO layer 1 lwc lac iter 9, lr 0.00125375 time 4.451300s, mse: 0.00321724
90
+ [2026-01-09 09:29:18 root] (train_utils.py 185): INFO layer 1 lwc lac iter 10, lr 0.00083135 time 4.386121s, mse: 0.00316591
91
+ [2026-01-09 09:29:22 root] (train_utils.py 185): INFO layer 1 lwc lac iter 11, lr 0.00048198 time 4.420070s, mse: 0.00313276
92
+ [2026-01-09 09:29:27 root] (train_utils.py 185): INFO layer 1 lwc lac iter 12, lr 0.00022092 time 4.264460s, mse: 0.00310469
93
+ [2026-01-09 09:29:31 root] (train_utils.py 185): INFO layer 1 lwc lac iter 13, lr 0.00005958 time 3.953069s, mse: 0.00308243
94
+ [2026-01-09 09:29:38 root] (train_utils.py 185): INFO layer 1 lwc lac iter 14, lr 0.00000500 time 7.505043s, mse: 0.00306749
95
+ [2026-01-09 09:29:39 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
96
+ [2026-01-09 09:29:39 root] (train_utils.py 108): INFO ========= Layer 2 =========
97
+ [2026-01-09 09:29:52 root] (train_utils.py 185): INFO layer 2 lwc lac iter 0, lr 0.00494542 time 8.480551s, mse: 0.01750460
98
+ [2026-01-09 09:29:59 root] (train_utils.py 185): INFO layer 2 lwc lac iter 1, lr 0.00478408 time 7.587432s, mse: 0.00626545
99
+ [2026-01-09 09:30:06 root] (train_utils.py 185): INFO layer 2 lwc lac iter 2, lr 0.00452302 time 6.716398s, mse: 0.00494380
100
+ [2026-01-09 09:30:10 root] (train_utils.py 185): INFO layer 2 lwc lac iter 3, lr 0.00417365 time 4.318102s, mse: 0.00453308
101
+ [2026-01-09 09:30:17 root] (train_utils.py 185): INFO layer 2 lwc lac iter 4, lr 0.00375125 time 7.168204s, mse: 0.00439964
102
+ [2026-01-09 09:30:25 root] (train_utils.py 185): INFO layer 2 lwc lac iter 5, lr 0.00327427 time 7.189687s, mse: 0.00429795
103
+ [2026-01-09 09:30:32 root] (train_utils.py 185): INFO layer 2 lwc lac iter 6, lr 0.00276356 time 7.178458s, mse: 0.00425246
104
+ [2026-01-09 09:30:39 root] (train_utils.py 185): INFO layer 2 lwc lac iter 7, lr 0.00224144 time 7.220179s, mse: 0.00420888
105
+ [2026-01-09 09:30:44 root] (train_utils.py 185): INFO layer 2 lwc lac iter 8, lr 0.00173073 time 5.011026s, mse: 0.00415287
106
+ [2026-01-09 09:30:48 root] (train_utils.py 185): INFO layer 2 lwc lac iter 9, lr 0.00125375 time 3.906071s, mse: 0.00411024
107
+ [2026-01-09 09:30:55 root] (train_utils.py 185): INFO layer 2 lwc lac iter 10, lr 0.00083135 time 7.158067s, mse: 0.00407672
108
+ [2026-01-09 09:31:02 root] (train_utils.py 185): INFO layer 2 lwc lac iter 11, lr 0.00048198 time 7.198819s, mse: 0.00404750
109
+ [2026-01-09 09:31:09 root] (train_utils.py 185): INFO layer 2 lwc lac iter 12, lr 0.00022092 time 7.203617s, mse: 0.00401742
110
+ [2026-01-09 09:31:17 root] (train_utils.py 185): INFO layer 2 lwc lac iter 13, lr 0.00005958 time 7.287185s, mse: 0.00398090
111
+ [2026-01-09 09:31:22 root] (train_utils.py 185): INFO layer 2 lwc lac iter 14, lr 0.00000500 time 5.615172s, mse: 0.00397130
112
+ [2026-01-09 09:31:23 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
113
+ [2026-01-09 09:31:23 root] (train_utils.py 108): INFO ========= Layer 3 =========
114
+ [2026-01-09 09:31:32 root] (train_utils.py 185): INFO layer 3 lwc lac iter 0, lr 0.00494542 time 6.023239s, mse: 0.02308414
115
+ [2026-01-09 09:31:36 root] (train_utils.py 185): INFO layer 3 lwc lac iter 1, lr 0.00478408 time 3.922480s, mse: 0.01333557
116
+ [2026-01-09 09:31:41 root] (train_utils.py 185): INFO layer 3 lwc lac iter 2, lr 0.00452302 time 4.711427s, mse: 0.01099337
117
+ [2026-01-09 09:31:48 root] (train_utils.py 185): INFO layer 3 lwc lac iter 3, lr 0.00417365 time 7.597089s, mse: 0.01028412
118
+ [2026-01-09 09:31:56 root] (train_utils.py 185): INFO layer 3 lwc lac iter 4, lr 0.00375125 time 7.595494s, mse: 0.01000082
119
+ [2026-01-09 09:32:04 root] (train_utils.py 185): INFO layer 3 lwc lac iter 5, lr 0.00327427 time 7.568671s, mse: 0.00980410
120
+ [2026-01-09 09:32:11 root] (train_utils.py 185): INFO layer 3 lwc lac iter 6, lr 0.00276356 time 7.582738s, mse: 0.00969286
121
+ [2026-01-09 09:32:17 root] (train_utils.py 185): INFO layer 3 lwc lac iter 7, lr 0.00224144 time 6.011546s, mse: 0.00956387
122
+ [2026-01-09 09:32:22 root] (train_utils.py 185): INFO layer 3 lwc lac iter 8, lr 0.00173073 time 4.549473s, mse: 0.00946260
123
+ [2026-01-09 09:32:26 root] (train_utils.py 185): INFO layer 3 lwc lac iter 9, lr 0.00125375 time 4.490500s, mse: 0.00937346
124
+ [2026-01-09 09:32:31 root] (train_utils.py 185): INFO layer 3 lwc lac iter 10, lr 0.00083135 time 4.534710s, mse: 0.00926330
125
+ [2026-01-09 09:32:35 root] (train_utils.py 185): INFO layer 3 lwc lac iter 11, lr 0.00048198 time 4.458297s, mse: 0.00916464
126
+ [2026-01-09 09:32:39 root] (train_utils.py 185): INFO layer 3 lwc lac iter 12, lr 0.00022092 time 3.922243s, mse: 0.00907166
127
+ [2026-01-09 09:32:43 root] (train_utils.py 185): INFO layer 3 lwc lac iter 13, lr 0.00005958 time 3.890109s, mse: 0.00904066
128
+ [2026-01-09 09:32:47 root] (train_utils.py 185): INFO layer 3 lwc lac iter 14, lr 0.00000500 time 3.885668s, mse: 0.00900416
129
+ [2026-01-09 09:32:47 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
130
+ [2026-01-09 09:32:48 root] (train_utils.py 108): INFO ========= Layer 4 =========
131
+ [2026-01-09 09:32:59 root] (train_utils.py 185): INFO layer 4 lwc lac iter 0, lr 0.00494542 time 8.274354s, mse: 0.06576648
132
+ [2026-01-09 09:33:07 root] (train_utils.py 185): INFO layer 4 lwc lac iter 1, lr 0.00478408 time 8.427314s, mse: 0.03741666
133
+ [2026-01-09 09:33:15 root] (train_utils.py 185): INFO layer 4 lwc lac iter 2, lr 0.00452302 time 8.378025s, mse: 0.03053248
134
+ [2026-01-09 09:33:24 root] (train_utils.py 185): INFO layer 4 lwc lac iter 3, lr 0.00417365 time 8.409590s, mse: 0.02855516
135
+ [2026-01-09 09:33:32 root] (train_utils.py 185): INFO layer 4 lwc lac iter 4, lr 0.00375125 time 8.404675s, mse: 0.02790034
136
+ [2026-01-09 09:33:39 root] (train_utils.py 185): INFO layer 4 lwc lac iter 5, lr 0.00327427 time 6.628190s, mse: 0.02746365
137
+ [2026-01-09 09:33:43 root] (train_utils.py 185): INFO layer 4 lwc lac iter 6, lr 0.00276356 time 3.947360s, mse: 0.02716962
138
+ [2026-01-09 09:33:47 root] (train_utils.py 185): INFO layer 4 lwc lac iter 7, lr 0.00224144 time 3.893767s, mse: 0.02687641
139
+ [2026-01-09 09:33:51 root] (train_utils.py 185): INFO layer 4 lwc lac iter 8, lr 0.00173073 time 3.886842s, mse: 0.02662238
140
+ [2026-01-09 09:33:56 root] (train_utils.py 185): INFO layer 4 lwc lac iter 9, lr 0.00125375 time 5.446397s, mse: 0.02643147
141
+ [2026-01-09 09:34:04 root] (train_utils.py 185): INFO layer 4 lwc lac iter 10, lr 0.00083135 time 8.440320s, mse: 0.02624781
142
+ [2026-01-09 09:34:13 root] (train_utils.py 185): INFO layer 4 lwc lac iter 11, lr 0.00048198 time 8.405491s, mse: 0.02604026
143
+ [2026-01-09 09:34:21 root] (train_utils.py 185): INFO layer 4 lwc lac iter 12, lr 0.00022092 time 8.418950s, mse: 0.02585863
144
+ [2026-01-09 09:34:30 root] (train_utils.py 185): INFO layer 4 lwc lac iter 13, lr 0.00005958 time 8.423032s, mse: 0.02578292
145
+ [2026-01-09 09:34:38 root] (train_utils.py 185): INFO layer 4 lwc lac iter 14, lr 0.00000500 time 8.420124s, mse: 0.02572995
146
+ [2026-01-09 09:34:39 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
147
+ [2026-01-09 09:34:39 root] (train_utils.py 108): INFO ========= Layer 5 =========
148
+ [2026-01-09 09:34:47 root] (train_utils.py 185): INFO layer 5 lwc lac iter 0, lr 0.00494542 time 5.174283s, mse: 0.13743916
149
+ [2026-01-09 09:34:51 root] (train_utils.py 185): INFO layer 5 lwc lac iter 1, lr 0.00478408 time 4.443467s, mse: 0.08057592
150
+ [2026-01-09 09:34:56 root] (train_utils.py 185): INFO layer 5 lwc lac iter 2, lr 0.00452302 time 4.403272s, mse: 0.06617787
151
+ [2026-01-09 09:35:00 root] (train_utils.py 185): INFO layer 5 lwc lac iter 3, lr 0.00417365 time 4.398644s, mse: 0.06287611
152
+ [2026-01-09 09:35:04 root] (train_utils.py 185): INFO layer 5 lwc lac iter 4, lr 0.00375125 time 4.206020s, mse: 0.06213523
153
+ [2026-01-09 09:35:08 root] (train_utils.py 185): INFO layer 5 lwc lac iter 5, lr 0.00327427 time 3.923909s, mse: 0.06160403
154
+ [2026-01-09 09:35:12 root] (train_utils.py 185): INFO layer 5 lwc lac iter 6, lr 0.00276356 time 3.888113s, mse: 0.06119698
155
+ [2026-01-09 09:35:16 root] (train_utils.py 185): INFO layer 5 lwc lac iter 7, lr 0.00224144 time 3.887599s, mse: 0.06094177
156
+ [2026-01-09 09:35:20 root] (train_utils.py 185): INFO layer 5 lwc lac iter 8, lr 0.00173073 time 3.892878s, mse: 0.06060794
157
+ [2026-01-09 09:35:24 root] (train_utils.py 185): INFO layer 5 lwc lac iter 9, lr 0.00125375 time 3.896571s, mse: 0.06020888
158
+ [2026-01-09 09:35:28 root] (train_utils.py 185): INFO layer 5 lwc lac iter 10, lr 0.00083135 time 3.892194s, mse: 0.05995716
159
+ [2026-01-09 09:35:31 root] (train_utils.py 185): INFO layer 5 lwc lac iter 11, lr 0.00048198 time 3.886368s, mse: 0.05978661
160
+ [2026-01-09 09:35:35 root] (train_utils.py 185): INFO layer 5 lwc lac iter 12, lr 0.00022092 time 3.901675s, mse: 0.05955682
161
+ [2026-01-09 09:35:39 root] (train_utils.py 185): INFO layer 5 lwc lac iter 13, lr 0.00005958 time 3.896321s, mse: 0.05938030
162
+ [2026-01-09 09:35:43 root] (train_utils.py 185): INFO layer 5 lwc lac iter 14, lr 0.00000500 time 3.965801s, mse: 0.05934311
163
+ [2026-01-09 09:35:44 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
164
+ [2026-01-09 09:35:44 root] (train_utils.py 108): INFO ========= Layer 6 =========
165
+ [2026-01-09 09:35:51 root] (train_utils.py 185): INFO layer 6 lwc lac iter 0, lr 0.00494542 time 4.703084s, mse: 1.86451793
166
+ [2026-01-09 09:35:55 root] (train_utils.py 185): INFO layer 6 lwc lac iter 1, lr 0.00478408 time 3.884077s, mse: 0.35658583
167
+ [2026-01-09 09:35:59 root] (train_utils.py 185): INFO layer 6 lwc lac iter 2, lr 0.00452302 time 3.885417s, mse: 0.32737118
168
+ [2026-01-09 09:36:03 root] (train_utils.py 185): INFO layer 6 lwc lac iter 3, lr 0.00417365 time 3.887404s, mse: 0.28929594
169
+ [2026-01-09 09:36:07 root] (train_utils.py 185): INFO layer 6 lwc lac iter 4, lr 0.00375125 time 3.931743s, mse: 0.24128482
170
+ [2026-01-09 09:36:11 root] (train_utils.py 185): INFO layer 6 lwc lac iter 5, lr 0.00327427 time 3.886941s, mse: 0.21027605
171
+ [2026-01-09 09:36:15 root] (train_utils.py 185): INFO layer 6 lwc lac iter 6, lr 0.00276356 time 3.900860s, mse: 0.25483868
172
+ [2026-01-09 09:36:19 root] (train_utils.py 185): INFO layer 6 lwc lac iter 7, lr 0.00224144 time 3.881273s, mse: 0.23871142
173
+ [2026-01-09 09:36:22 root] (train_utils.py 185): INFO layer 6 lwc lac iter 8, lr 0.00173073 time 3.888987s, mse: 0.21885920
174
+ [2026-01-09 09:36:26 root] (train_utils.py 185): INFO layer 6 lwc lac iter 9, lr 0.00125375 time 3.890998s, mse: 0.20672695
175
+ [2026-01-09 09:36:30 root] (train_utils.py 185): INFO layer 6 lwc lac iter 10, lr 0.00083135 time 3.887677s, mse: 0.20202750
176
+ [2026-01-09 09:36:34 root] (train_utils.py 185): INFO layer 6 lwc lac iter 11, lr 0.00048198 time 3.885399s, mse: 0.17932597
177
+ [2026-01-09 09:36:38 root] (train_utils.py 185): INFO layer 6 lwc lac iter 12, lr 0.00022092 time 3.888746s, mse: 0.20257902
178
+ [2026-01-09 09:36:42 root] (train_utils.py 185): INFO layer 6 lwc lac iter 13, lr 0.00005958 time 3.932971s, mse: 0.20667967
179
+ [2026-01-09 09:36:46 root] (train_utils.py 185): INFO layer 6 lwc lac iter 14, lr 0.00000500 time 3.969469s, mse: 0.16777667
180
+ [2026-01-09 09:36:46 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
181
+ [2026-01-09 09:36:47 root] (train_utils.py 108): INFO ========= Layer 7 =========
182
+ [2026-01-09 09:36:54 root] (train_utils.py 185): INFO layer 7 lwc lac iter 0, lr 0.00494542 time 4.668406s, mse: 0.23462753
183
+ [2026-01-09 09:36:58 root] (train_utils.py 185): INFO layer 7 lwc lac iter 1, lr 0.00478408 time 3.981027s, mse: 0.14976017
184
+ [2026-01-09 09:37:02 root] (train_utils.py 185): INFO layer 7 lwc lac iter 2, lr 0.00452302 time 3.976696s, mse: 0.12312289
185
+ [2026-01-09 09:37:06 root] (train_utils.py 185): INFO layer 7 lwc lac iter 3, lr 0.00417365 time 3.965072s, mse: 0.11779824
186
+ [2026-01-09 09:37:10 root] (train_utils.py 185): INFO layer 7 lwc lac iter 4, lr 0.00375125 time 3.976833s, mse: 0.11621600
187
+ [2026-01-09 09:37:14 root] (train_utils.py 185): INFO layer 7 lwc lac iter 5, lr 0.00327427 time 3.983180s, mse: 0.11538153
188
+ [2026-01-09 09:37:18 root] (train_utils.py 185): INFO layer 7 lwc lac iter 6, lr 0.00276356 time 3.975625s, mse: 0.11461711
189
+ [2026-01-09 09:37:22 root] (train_utils.py 185): INFO layer 7 lwc lac iter 7, lr 0.00224144 time 3.975661s, mse: 0.11396322
190
+ [2026-01-09 09:37:26 root] (train_utils.py 185): INFO layer 7 lwc lac iter 8, lr 0.00173073 time 3.959342s, mse: 0.11346199
191
+ [2026-01-09 09:37:30 root] (train_utils.py 185): INFO layer 7 lwc lac iter 9, lr 0.00125375 time 3.949084s, mse: 0.11303829
192
+ [2026-01-09 09:37:34 root] (train_utils.py 185): INFO layer 7 lwc lac iter 10, lr 0.00083135 time 4.273962s, mse: 0.11244514
193
+ [2026-01-09 09:37:38 root] (train_utils.py 185): INFO layer 7 lwc lac iter 11, lr 0.00048198 time 4.313030s, mse: 0.11193727
194
+ [2026-01-09 09:37:43 root] (train_utils.py 185): INFO layer 7 lwc lac iter 12, lr 0.00022092 time 4.331037s, mse: 0.11167257
195
+ [2026-01-09 09:37:47 root] (train_utils.py 185): INFO layer 7 lwc lac iter 13, lr 0.00005958 time 4.272573s, mse: 0.11139309
196
+ [2026-01-09 09:37:51 root] (train_utils.py 185): INFO layer 7 lwc lac iter 14, lr 0.00000500 time 4.334657s, mse: 0.11127126
197
+ [2026-01-09 09:37:52 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
198
+ [2026-01-09 09:37:52 root] (train_utils.py 108): INFO ========= Layer 8 =========
199
+ [2026-01-09 09:38:00 root] (train_utils.py 185): INFO layer 8 lwc lac iter 0, lr 0.00494542 time 4.868927s, mse: 0.31783378
200
+ [2026-01-09 09:38:04 root] (train_utils.py 185): INFO layer 8 lwc lac iter 1, lr 0.00478408 time 4.520718s, mse: 0.21154313
201
+ [2026-01-09 09:38:12 root] (train_utils.py 185): INFO layer 8 lwc lac iter 2, lr 0.00452302 time 7.594991s, mse: 0.17556834
202
+ [2026-01-09 09:38:19 root] (train_utils.py 185): INFO layer 8 lwc lac iter 3, lr 0.00417365 time 7.589824s, mse: 0.16892871
203
+ [2026-01-09 09:38:27 root] (train_utils.py 185): INFO layer 8 lwc lac iter 4, lr 0.00375125 time 7.583855s, mse: 0.16700211
204
+ [2026-01-09 09:38:35 root] (train_utils.py 185): INFO layer 8 lwc lac iter 5, lr 0.00327427 time 7.586124s, mse: 0.16594610
205
+ [2026-01-09 09:38:40 root] (train_utils.py 185): INFO layer 8 lwc lac iter 6, lr 0.00276356 time 5.884252s, mse: 0.16510613
206
+ [2026-01-09 09:38:45 root] (train_utils.py 185): INFO layer 8 lwc lac iter 7, lr 0.00224144 time 4.319697s, mse: 0.16456470
207
+ [2026-01-09 09:38:52 root] (train_utils.py 185): INFO layer 8 lwc lac iter 8, lr 0.00173073 time 7.242422s, mse: 0.16401851
208
+ [2026-01-09 09:38:59 root] (train_utils.py 185): INFO layer 8 lwc lac iter 9, lr 0.00125375 time 7.228258s, mse: 0.16352586
209
+ [2026-01-09 09:39:06 root] (train_utils.py 185): INFO layer 8 lwc lac iter 10, lr 0.00083135 time 7.222845s, mse: 0.16331530
210
+ [2026-01-09 09:39:14 root] (train_utils.py 185): INFO layer 8 lwc lac iter 11, lr 0.00048198 time 7.226355s, mse: 0.16285881
211
+ [2026-01-09 09:39:19 root] (train_utils.py 185): INFO layer 8 lwc lac iter 12, lr 0.00022092 time 4.796606s, mse: 0.16254890
212
+ [2026-01-09 09:39:23 root] (train_utils.py 185): INFO layer 8 lwc lac iter 13, lr 0.00005958 time 4.818619s, mse: 0.16240378
213
+ [2026-01-09 09:39:31 root] (train_utils.py 185): INFO layer 8 lwc lac iter 14, lr 0.00000500 time 7.213155s, mse: 0.16246043
214
+ [2026-01-09 09:39:31 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
215
+ [2026-01-09 09:39:31 root] (train_utils.py 108): INFO ========= Layer 9 =========
216
+ [2026-01-09 09:39:43 root] (train_utils.py 185): INFO layer 9 lwc lac iter 0, lr 0.00494542 time 8.041505s, mse: 0.37875688
217
+ [2026-01-09 09:39:50 root] (train_utils.py 185): INFO layer 9 lwc lac iter 1, lr 0.00478408 time 7.230130s, mse: 0.25363240
218
+ [2026-01-09 09:39:56 root] (train_utils.py 185): INFO layer 9 lwc lac iter 2, lr 0.00452302 time 5.053484s, mse: 0.21064380
219
+ [2026-01-09 09:40:01 root] (train_utils.py 185): INFO layer 9 lwc lac iter 3, lr 0.00417365 time 5.305918s, mse: 0.20179385
220
+ [2026-01-09 09:40:05 root] (train_utils.py 185): INFO layer 9 lwc lac iter 4, lr 0.00375125 time 4.621566s, mse: 0.19936548
221
+ [2026-01-09 09:40:09 root] (train_utils.py 185): INFO layer 9 lwc lac iter 5, lr 0.00327427 time 3.884829s, mse: 0.19817175
222
+ [2026-01-09 09:40:15 root] (train_utils.py 185): INFO layer 9 lwc lac iter 6, lr 0.00276356 time 5.358749s, mse: 0.19703594
223
+ [2026-01-09 09:40:22 root] (train_utils.py 185): INFO layer 9 lwc lac iter 7, lr 0.00224144 time 7.561985s, mse: 0.19626960
224
+ [2026-01-09 09:40:30 root] (train_utils.py 185): INFO layer 9 lwc lac iter 8, lr 0.00173073 time 7.595258s, mse: 0.19534998
225
+ [2026-01-09 09:40:37 root] (train_utils.py 185): INFO layer 9 lwc lac iter 9, lr 0.00125375 time 7.582781s, mse: 0.19473058
226
+ [2026-01-09 09:40:45 root] (train_utils.py 185): INFO layer 9 lwc lac iter 10, lr 0.00083135 time 7.686873s, mse: 0.19404019
227
+ [2026-01-09 09:40:50 root] (train_utils.py 185): INFO layer 9 lwc lac iter 11, lr 0.00048198 time 5.390258s, mse: 0.19356999
228
+ [2026-01-09 09:40:55 root] (train_utils.py 185): INFO layer 9 lwc lac iter 12, lr 0.00022092 time 4.492068s, mse: 0.19326007
229
+ [2026-01-09 09:40:59 root] (train_utils.py 185): INFO layer 9 lwc lac iter 13, lr 0.00005958 time 4.497151s, mse: 0.19282311
230
+ [2026-01-09 09:41:04 root] (train_utils.py 185): INFO layer 9 lwc lac iter 14, lr 0.00000500 time 4.535296s, mse: 0.19267595
231
+ [2026-01-09 09:41:05 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
232
+ [2026-01-09 09:41:05 root] (train_utils.py 108): INFO ========= Layer 10 =========
233
+ [2026-01-09 09:41:13 root] (train_utils.py 185): INFO layer 10 lwc lac iter 0, lr 0.00494542 time 4.746660s, mse: 0.44592521
234
+ [2026-01-09 09:41:17 root] (train_utils.py 185): INFO layer 10 lwc lac iter 1, lr 0.00478408 time 3.956183s, mse: 0.28058022
235
+ [2026-01-09 09:41:21 root] (train_utils.py 185): INFO layer 10 lwc lac iter 2, lr 0.00452302 time 3.949066s, mse: 0.22870731
236
+ [2026-01-09 09:41:28 root] (train_utils.py 185): INFO layer 10 lwc lac iter 3, lr 0.00417365 time 7.370824s, mse: 0.21672769
237
+ [2026-01-09 09:41:36 root] (train_utils.py 185): INFO layer 10 lwc lac iter 4, lr 0.00375125 time 8.410934s, mse: 0.21354958
238
+ [2026-01-09 09:41:45 root] (train_utils.py 185): INFO layer 10 lwc lac iter 5, lr 0.00327427 time 8.433516s, mse: 0.21149486
239
+ [2026-01-09 09:41:53 root] (train_utils.py 185): INFO layer 10 lwc lac iter 6, lr 0.00276356 time 8.380215s, mse: 0.21045262
240
+ [2026-01-09 09:42:02 root] (train_utils.py 185): INFO layer 10 lwc lac iter 7, lr 0.00224144 time 8.411274s, mse: 0.20926467
241
+ [2026-01-09 09:42:08 root] (train_utils.py 185): INFO layer 10 lwc lac iter 8, lr 0.00173073 time 6.738219s, mse: 0.20823501
242
+ [2026-01-09 09:42:12 root] (train_utils.py 185): INFO layer 10 lwc lac iter 9, lr 0.00125375 time 3.919702s, mse: 0.20746952
243
+ [2026-01-09 09:42:16 root] (train_utils.py 185): INFO layer 10 lwc lac iter 10, lr 0.00083135 time 3.885594s, mse: 0.20690618
244
+ [2026-01-09 09:42:20 root] (train_utils.py 185): INFO layer 10 lwc lac iter 11, lr 0.00048198 time 3.890161s, mse: 0.20613439
245
+ [2026-01-09 09:42:24 root] (train_utils.py 185): INFO layer 10 lwc lac iter 12, lr 0.00022092 time 3.880738s, mse: 0.20562243
246
+ [2026-01-09 09:42:31 root] (train_utils.py 185): INFO layer 10 lwc lac iter 13, lr 0.00005958 time 7.122962s, mse: 0.20517452
247
+ [2026-01-09 09:42:39 root] (train_utils.py 185): INFO layer 10 lwc lac iter 14, lr 0.00000500 time 8.400930s, mse: 0.20504668
248
+ [2026-01-09 09:42:40 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
249
+ [2026-01-09 09:42:41 root] (train_utils.py 108): INFO ========= Layer 11 =========
250
+ [2026-01-09 09:42:54 root] (train_utils.py 185): INFO layer 11 lwc lac iter 0, lr 0.00494542 time 9.293272s, mse: 0.39262417
251
+ [2026-01-09 09:43:02 root] (train_utils.py 185): INFO layer 11 lwc lac iter 1, lr 0.00478408 time 8.410596s, mse: 0.27127978
252
+ [2026-01-09 09:43:10 root] (train_utils.py 185): INFO layer 11 lwc lac iter 2, lr 0.00452302 time 7.794915s, mse: 0.22630122
253
+ [2026-01-09 09:43:15 root] (train_utils.py 185): INFO layer 11 lwc lac iter 3, lr 0.00417365 time 4.369798s, mse: 0.21789221
254
+ [2026-01-09 09:43:19 root] (train_utils.py 185): INFO layer 11 lwc lac iter 4, lr 0.00375125 time 4.233581s, mse: 0.21573043
255
+ [2026-01-09 09:43:23 root] (train_utils.py 185): INFO layer 11 lwc lac iter 5, lr 0.00327427 time 4.393658s, mse: 0.21401882
256
+ [2026-01-09 09:43:28 root] (train_utils.py 185): INFO layer 11 lwc lac iter 6, lr 0.00276356 time 4.394202s, mse: 0.21313243
257
+ [2026-01-09 09:43:32 root] (train_utils.py 185): INFO layer 11 lwc lac iter 7, lr 0.00224144 time 4.393358s, mse: 0.21215978
258
+ [2026-01-09 09:43:36 root] (train_utils.py 185): INFO layer 11 lwc lac iter 8, lr 0.00173073 time 4.195683s, mse: 0.21121168
259
+ [2026-01-09 09:43:40 root] (train_utils.py 185): INFO layer 11 lwc lac iter 9, lr 0.00125375 time 3.884118s, mse: 0.21032479
260
+ [2026-01-09 09:43:44 root] (train_utils.py 185): INFO layer 11 lwc lac iter 10, lr 0.00083135 time 3.887403s, mse: 0.20987187
261
+ [2026-01-09 09:43:48 root] (train_utils.py 185): INFO layer 11 lwc lac iter 11, lr 0.00048198 time 3.881930s, mse: 0.20908046
262
+ [2026-01-09 09:43:55 root] (train_utils.py 185): INFO layer 11 lwc lac iter 12, lr 0.00022092 time 7.261625s, mse: 0.20848191
263
+ [2026-01-09 09:44:05 root] (train_utils.py 185): INFO layer 11 lwc lac iter 13, lr 0.00005958 time 9.507146s, mse: 0.20800886
264
+ [2026-01-09 09:44:14 root] (train_utils.py 185): INFO layer 11 lwc lac iter 14, lr 0.00000500 time 9.487793s, mse: 0.20795538
265
+ [2026-01-09 09:44:15 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
266
+ [2026-01-09 09:44:15 root] (train_utils.py 108): INFO ========= Layer 12 =========
267
+ [2026-01-09 09:44:30 root] (train_utils.py 185): INFO layer 12 lwc lac iter 0, lr 0.00494542 time 10.434841s, mse: 0.43535280
268
+ [2026-01-09 09:44:40 root] (train_utils.py 185): INFO layer 12 lwc lac iter 1, lr 0.00478408 time 9.477639s, mse: 0.29579335
269
+ [2026-01-09 09:44:49 root] (train_utils.py 185): INFO layer 12 lwc lac iter 2, lr 0.00452302 time 9.506626s, mse: 0.24488190
270
+ [2026-01-09 09:44:59 root] (train_utils.py 185): INFO layer 12 lwc lac iter 3, lr 0.00417365 time 9.498228s, mse: 0.23438135
271
+ [2026-01-09 09:45:05 root] (train_utils.py 185): INFO layer 12 lwc lac iter 4, lr 0.00375125 time 6.546145s, mse: 0.23133603
272
+ [2026-01-09 09:45:10 root] (train_utils.py 185): INFO layer 12 lwc lac iter 5, lr 0.00327427 time 4.671869s, mse: 0.22933656
273
+ [2026-01-09 09:45:15 root] (train_utils.py 185): INFO layer 12 lwc lac iter 6, lr 0.00276356 time 5.492260s, mse: 0.22804067
274
+ [2026-01-09 09:45:20 root] (train_utils.py 185): INFO layer 12 lwc lac iter 7, lr 0.00224144 time 4.276597s, mse: 0.22690852
275
+ [2026-01-09 09:45:24 root] (train_utils.py 185): INFO layer 12 lwc lac iter 8, lr 0.00173073 time 3.885321s, mse: 0.22579126
276
+ [2026-01-09 09:45:27 root] (train_utils.py 185): INFO layer 12 lwc lac iter 9, lr 0.00125375 time 3.886717s, mse: 0.22475064
277
+ [2026-01-09 09:45:31 root] (train_utils.py 185): INFO layer 12 lwc lac iter 10, lr 0.00083135 time 3.877200s, mse: 0.22366890
278
+ [2026-01-09 09:45:35 root] (train_utils.py 185): INFO layer 12 lwc lac iter 11, lr 0.00048198 time 3.898989s, mse: 0.22277188
279
+ [2026-01-09 09:45:39 root] (train_utils.py 185): INFO layer 12 lwc lac iter 12, lr 0.00022092 time 3.874586s, mse: 0.22196589
280
+ [2026-01-09 09:45:43 root] (train_utils.py 185): INFO layer 12 lwc lac iter 13, lr 0.00005958 time 3.879584s, mse: 0.22144113
281
+ [2026-01-09 09:45:47 root] (train_utils.py 185): INFO layer 12 lwc lac iter 14, lr 0.00000500 time 3.883109s, mse: 0.22116731
282
+ [2026-01-09 09:45:47 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
283
+ [2026-01-09 09:45:48 root] (train_utils.py 108): INFO ========= Layer 13 =========
284
+ [2026-01-09 09:45:55 root] (train_utils.py 185): INFO layer 13 lwc lac iter 0, lr 0.00494542 time 4.864463s, mse: 0.44991863
285
+ [2026-01-09 09:45:59 root] (train_utils.py 185): INFO layer 13 lwc lac iter 1, lr 0.00478408 time 3.966472s, mse: 0.30773303
286
+ [2026-01-09 09:46:03 root] (train_utils.py 185): INFO layer 13 lwc lac iter 2, lr 0.00452302 time 3.968499s, mse: 0.25602528
287
+ [2026-01-09 09:46:07 root] (train_utils.py 185): INFO layer 13 lwc lac iter 3, lr 0.00417365 time 3.985150s, mse: 0.24593170
288
+ [2026-01-09 09:46:11 root] (train_utils.py 185): INFO layer 13 lwc lac iter 4, lr 0.00375125 time 3.969827s, mse: 0.24332635
289
+ [2026-01-09 09:46:15 root] (train_utils.py 185): INFO layer 13 lwc lac iter 5, lr 0.00327427 time 3.975120s, mse: 0.24169515
290
+ [2026-01-09 09:46:19 root] (train_utils.py 185): INFO layer 13 lwc lac iter 6, lr 0.00276356 time 3.966719s, mse: 0.24032030
291
+ [2026-01-09 09:46:23 root] (train_utils.py 185): INFO layer 13 lwc lac iter 7, lr 0.00224144 time 3.981713s, mse: 0.23895445
292
+ [2026-01-09 09:46:27 root] (train_utils.py 185): INFO layer 13 lwc lac iter 8, lr 0.00173073 time 3.957695s, mse: 0.23795472
293
+ [2026-01-09 09:46:31 root] (train_utils.py 185): INFO layer 13 lwc lac iter 9, lr 0.00125375 time 3.928493s, mse: 0.23691620
294
+ [2026-01-09 09:46:35 root] (train_utils.py 185): INFO layer 13 lwc lac iter 10, lr 0.00083135 time 3.950974s, mse: 0.23617835
295
+ [2026-01-09 09:46:39 root] (train_utils.py 185): INFO layer 13 lwc lac iter 11, lr 0.00048198 time 4.044775s, mse: 0.23538260
296
+ [2026-01-09 09:46:43 root] (train_utils.py 185): INFO layer 13 lwc lac iter 12, lr 0.00022092 time 4.395989s, mse: 0.23459788
297
+ [2026-01-09 09:46:48 root] (train_utils.py 185): INFO layer 13 lwc lac iter 13, lr 0.00005958 time 4.461857s, mse: 0.23386008
298
+ [2026-01-09 09:46:52 root] (train_utils.py 185): INFO layer 13 lwc lac iter 14, lr 0.00000500 time 4.388942s, mse: 0.23347831
299
+ [2026-01-09 09:46:52 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
300
+ [2026-01-09 09:46:53 root] (train_utils.py 108): INFO ========= Layer 14 =========
301
+ [2026-01-09 09:47:01 root] (train_utils.py 185): INFO layer 14 lwc lac iter 0, lr 0.00494542 time 5.077703s, mse: 0.48670265
302
+ [2026-01-09 09:47:05 root] (train_utils.py 185): INFO layer 14 lwc lac iter 1, lr 0.00478408 time 3.912335s, mse: 0.32924685
303
+ [2026-01-09 09:47:10 root] (train_utils.py 185): INFO layer 14 lwc lac iter 2, lr 0.00452302 time 5.693116s, mse: 0.27174610
304
+ [2026-01-09 09:47:18 root] (train_utils.py 185): INFO layer 14 lwc lac iter 3, lr 0.00417365 time 7.564554s, mse: 0.26111004
305
+ [2026-01-09 09:47:25 root] (train_utils.py 185): INFO layer 14 lwc lac iter 4, lr 0.00375125 time 7.573057s, mse: 0.25857583
306
+ [2026-01-09 09:47:33 root] (train_utils.py 185): INFO layer 14 lwc lac iter 5, lr 0.00327427 time 7.558129s, mse: 0.25724220
307
+ [2026-01-09 09:47:41 root] (train_utils.py 185): INFO layer 14 lwc lac iter 6, lr 0.00276356 time 7.571431s, mse: 0.25530052
308
+ [2026-01-09 09:47:45 root] (train_utils.py 185): INFO layer 14 lwc lac iter 7, lr 0.00224144 time 4.817823s, mse: 0.25373703
309
+ [2026-01-09 09:47:49 root] (train_utils.py 185): INFO layer 14 lwc lac iter 8, lr 0.00173073 time 3.882754s, mse: 0.25232333
310
+ [2026-01-09 09:47:55 root] (train_utils.py 185): INFO layer 14 lwc lac iter 9, lr 0.00125375 time 5.495000s, mse: 0.25103748
311
+ [2026-01-09 09:48:02 root] (train_utils.py 185): INFO layer 14 lwc lac iter 10, lr 0.00083135 time 7.215655s, mse: 0.24987648
312
+ [2026-01-09 09:48:09 root] (train_utils.py 185): INFO layer 14 lwc lac iter 11, lr 0.00048198 time 7.181203s, mse: 0.24912813
313
+ [2026-01-09 09:48:16 root] (train_utils.py 185): INFO layer 14 lwc lac iter 12, lr 0.00022092 time 7.226618s, mse: 0.24813016
314
+ [2026-01-09 09:48:23 root] (train_utils.py 185): INFO layer 14 lwc lac iter 13, lr 0.00005958 time 7.023306s, mse: 0.24762598
315
+ [2026-01-09 09:48:27 root] (train_utils.py 185): INFO layer 14 lwc lac iter 14, lr 0.00000500 time 3.881495s, mse: 0.24739194
316
+ [2026-01-09 09:48:28 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
317
+ [2026-01-09 09:48:28 root] (train_utils.py 108): INFO ========= Layer 15 =========
318
+ [2026-01-09 09:48:38 root] (train_utils.py 185): INFO layer 15 lwc lac iter 0, lr 0.00494542 time 7.331558s, mse: 0.48941827
319
+ [2026-01-09 09:48:45 root] (train_utils.py 185): INFO layer 15 lwc lac iter 1, lr 0.00478408 time 7.256174s, mse: 0.32720220
320
+ [2026-01-09 09:48:52 root] (train_utils.py 185): INFO layer 15 lwc lac iter 2, lr 0.00452302 time 7.203066s, mse: 0.26854873
321
+ [2026-01-09 09:49:00 root] (train_utils.py 185): INFO layer 15 lwc lac iter 3, lr 0.00417365 time 7.211432s, mse: 0.25705975
322
+ [2026-01-09 09:49:06 root] (train_utils.py 185): INFO layer 15 lwc lac iter 4, lr 0.00375125 time 5.967390s, mse: 0.25422159
323
+ [2026-01-09 09:49:11 root] (train_utils.py 185): INFO layer 15 lwc lac iter 5, lr 0.00327427 time 5.112001s, mse: 0.25197345
324
+ [2026-01-09 09:49:16 root] (train_utils.py 185): INFO layer 15 lwc lac iter 6, lr 0.00276356 time 5.306431s, mse: 0.25026903
325
+ [2026-01-09 09:49:20 root] (train_utils.py 185): INFO layer 15 lwc lac iter 7, lr 0.00224144 time 3.906909s, mse: 0.24867499
326
+ [2026-01-09 09:49:24 root] (train_utils.py 185): INFO layer 15 lwc lac iter 8, lr 0.00173073 time 4.498396s, mse: 0.24771519
327
+ [2026-01-09 09:49:32 root] (train_utils.py 185): INFO layer 15 lwc lac iter 9, lr 0.00125375 time 7.580038s, mse: 0.24665023
328
+ [2026-01-09 09:49:40 root] (train_utils.py 185): INFO layer 15 lwc lac iter 10, lr 0.00083135 time 7.592080s, mse: 0.24558856
329
+ [2026-01-09 09:49:47 root] (train_utils.py 185): INFO layer 15 lwc lac iter 11, lr 0.00048198 time 7.600457s, mse: 0.24435455
330
+ [2026-01-09 09:49:55 root] (train_utils.py 185): INFO layer 15 lwc lac iter 12, lr 0.00022092 time 7.596599s, mse: 0.24346027
331
+ [2026-01-09 09:50:01 root] (train_utils.py 185): INFO layer 15 lwc lac iter 13, lr 0.00005958 time 6.140972s, mse: 0.24292424
332
+ [2026-01-09 09:50:06 root] (train_utils.py 185): INFO layer 15 lwc lac iter 14, lr 0.00000500 time 4.575392s, mse: 0.24260354
333
+ [2026-01-09 09:50:06 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
334
+ [2026-01-09 09:50:06 root] (train_utils.py 108): INFO ========= Layer 16 =========
335
+ [2026-01-09 09:50:15 root] (train_utils.py 185): INFO layer 16 lwc lac iter 0, lr 0.00494542 time 5.665595s, mse: 3.09758520
336
+ [2026-01-09 09:50:19 root] (train_utils.py 185): INFO layer 16 lwc lac iter 1, lr 0.00478408 time 3.916453s, mse: 1.53681600
337
+ [2026-01-09 09:50:23 root] (train_utils.py 185): INFO layer 16 lwc lac iter 2, lr 0.00452302 time 3.887970s, mse: 1.37538433
338
+ [2026-01-09 09:50:27 root] (train_utils.py 185): INFO layer 16 lwc lac iter 3, lr 0.00417365 time 3.882150s, mse: 1.14041376
339
+ [2026-01-09 09:50:32 root] (train_utils.py 185): INFO layer 16 lwc lac iter 4, lr 0.00375125 time 5.692225s, mse: 1.13041377
340
+ [2026-01-09 09:50:41 root] (train_utils.py 185): INFO layer 16 lwc lac iter 5, lr 0.00327427 time 8.392419s, mse: 1.17505825
341
+ [2026-01-09 09:50:49 root] (train_utils.py 185): INFO layer 16 lwc lac iter 6, lr 0.00276356 time 8.391150s, mse: 1.00187659
342
+ [2026-01-09 09:50:58 root] (train_utils.py 185): INFO layer 16 lwc lac iter 7, lr 0.00224144 time 8.387745s, mse: 1.15916288
343
+ [2026-01-09 09:51:06 root] (train_utils.py 185): INFO layer 16 lwc lac iter 8, lr 0.00173073 time 8.414940s, mse: 0.93556213
344
+ [2026-01-09 09:51:14 root] (train_utils.py 185): INFO layer 16 lwc lac iter 9, lr 0.00125375 time 8.394004s, mse: 0.89307052
345
+ [2026-01-09 09:51:18 root] (train_utils.py 185): INFO layer 16 lwc lac iter 10, lr 0.00083135 time 3.919834s, mse: 1.08854449
346
+ [2026-01-09 09:51:22 root] (train_utils.py 185): INFO layer 16 lwc lac iter 11, lr 0.00048198 time 3.896248s, mse: 0.78587675
347
+ [2026-01-09 09:51:26 root] (train_utils.py 185): INFO layer 16 lwc lac iter 12, lr 0.00022092 time 3.886925s, mse: 0.77024889
348
+ [2026-01-09 09:51:30 root] (train_utils.py 185): INFO layer 16 lwc lac iter 13, lr 0.00005958 time 3.890127s, mse: 0.74143833
349
+ [2026-01-09 09:51:34 root] (train_utils.py 185): INFO layer 16 lwc lac iter 14, lr 0.00000500 time 3.933678s, mse: 0.62904388
350
+ [2026-01-09 09:51:34 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
351
+ [2026-01-09 09:51:35 root] (train_utils.py 108): INFO ========= Layer 17 =========
352
+ [2026-01-09 09:51:49 root] (train_utils.py 185): INFO layer 17 lwc lac iter 0, lr 0.00494542 time 9.861759s, mse: 0.57632238
353
+ [2026-01-09 09:51:58 root] (train_utils.py 185): INFO layer 17 lwc lac iter 1, lr 0.00478408 time 8.438567s, mse: 0.38568184
354
+ [2026-01-09 09:52:06 root] (train_utils.py 185): INFO layer 17 lwc lac iter 2, lr 0.00452302 time 8.434239s, mse: 0.30990756
355
+ [2026-01-09 09:52:14 root] (train_utils.py 185): INFO layer 17 lwc lac iter 3, lr 0.00417365 time 8.419053s, mse: 0.29348093
356
+ [2026-01-09 09:52:20 root] (train_utils.py 185): INFO layer 17 lwc lac iter 4, lr 0.00375125 time 5.675947s, mse: 0.28841209
357
+ [2026-01-09 09:52:25 root] (train_utils.py 185): INFO layer 17 lwc lac iter 5, lr 0.00327427 time 4.414869s, mse: 0.28536177
358
+ [2026-01-09 09:52:29 root] (train_utils.py 185): INFO layer 17 lwc lac iter 6, lr 0.00276356 time 4.475958s, mse: 0.28336507
359
+ [2026-01-09 09:52:33 root] (train_utils.py 185): INFO layer 17 lwc lac iter 7, lr 0.00224144 time 4.307231s, mse: 0.28023016
360
+ [2026-01-09 09:52:38 root] (train_utils.py 185): INFO layer 17 lwc lac iter 8, lr 0.00173073 time 4.318885s, mse: 0.27797151
361
+ [2026-01-09 09:52:42 root] (train_utils.py 185): INFO layer 17 lwc lac iter 9, lr 0.00125375 time 4.301153s, mse: 0.27724716
362
+ [2026-01-09 09:52:46 root] (train_utils.py 185): INFO layer 17 lwc lac iter 10, lr 0.00083135 time 3.976857s, mse: 0.27549568
363
+ [2026-01-09 09:52:50 root] (train_utils.py 185): INFO layer 17 lwc lac iter 11, lr 0.00048198 time 3.880923s, mse: 0.27411795
364
+ [2026-01-09 09:52:54 root] (train_utils.py 185): INFO layer 17 lwc lac iter 12, lr 0.00022092 time 3.900098s, mse: 0.27230272
365
+ [2026-01-09 09:52:58 root] (train_utils.py 185): INFO layer 17 lwc lac iter 13, lr 0.00005958 time 3.882901s, mse: 0.27161792
366
+ [2026-01-09 09:53:05 root] (train_utils.py 185): INFO layer 17 lwc lac iter 14, lr 0.00000500 time 6.952255s, mse: 0.27142629
367
+ [2026-01-09 09:53:05 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
368
+ [2026-01-09 09:53:06 root] (train_utils.py 108): INFO ========= Layer 18 =========
369
+ [2026-01-09 09:53:21 root] (train_utils.py 185): INFO layer 18 lwc lac iter 0, lr 0.00494542 time 10.575045s, mse: 0.68219566
370
+ [2026-01-09 09:53:30 root] (train_utils.py 185): INFO layer 18 lwc lac iter 1, lr 0.00478408 time 9.497893s, mse: 0.44933167
371
+ [2026-01-09 09:53:40 root] (train_utils.py 185): INFO layer 18 lwc lac iter 2, lr 0.00452302 time 9.470748s, mse: 0.36149144
372
+ [2026-01-09 09:53:49 root] (train_utils.py 185): INFO layer 18 lwc lac iter 3, lr 0.00417365 time 9.480987s, mse: 0.34437451
373
+ [2026-01-09 09:53:59 root] (train_utils.py 185): INFO layer 18 lwc lac iter 4, lr 0.00375125 time 9.508292s, mse: 0.33928376
374
+ [2026-01-09 09:54:08 root] (train_utils.py 185): INFO layer 18 lwc lac iter 5, lr 0.00327427 time 9.476126s, mse: 0.33628541
375
+ [2026-01-09 09:54:15 root] (train_utils.py 185): INFO layer 18 lwc lac iter 6, lr 0.00276356 time 6.729955s, mse: 0.33380261
376
+ [2026-01-09 09:54:19 root] (train_utils.py 185): INFO layer 18 lwc lac iter 7, lr 0.00224144 time 4.353182s, mse: 0.33132178
377
+ [2026-01-09 09:54:24 root] (train_utils.py 185): INFO layer 18 lwc lac iter 8, lr 0.00173073 time 4.857163s, mse: 0.32943395
378
+ [2026-01-09 09:54:29 root] (train_utils.py 185): INFO layer 18 lwc lac iter 9, lr 0.00125375 time 5.068378s, mse: 0.32786560
379
+ [2026-01-09 09:54:33 root] (train_utils.py 185): INFO layer 18 lwc lac iter 10, lr 0.00083135 time 3.990787s, mse: 0.32583937
380
+ [2026-01-09 09:54:37 root] (train_utils.py 185): INFO layer 18 lwc lac iter 11, lr 0.00048198 time 3.884449s, mse: 0.32450172
381
+ [2026-01-09 09:54:41 root] (train_utils.py 185): INFO layer 18 lwc lac iter 12, lr 0.00022092 time 3.886384s, mse: 0.32264820
382
+ [2026-01-09 09:54:45 root] (train_utils.py 185): INFO layer 18 lwc lac iter 13, lr 0.00005958 time 3.888823s, mse: 0.32187557
383
+ [2026-01-09 09:54:49 root] (train_utils.py 185): INFO layer 18 lwc lac iter 14, lr 0.00000500 time 3.888052s, mse: 0.32105669
384
+ [2026-01-09 09:54:49 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
385
+ [2026-01-09 09:54:50 root] (train_utils.py 108): INFO ========= Layer 19 =========
386
+ [2026-01-09 09:54:57 root] (train_utils.py 185): INFO layer 19 lwc lac iter 0, lr 0.00494542 time 4.764529s, mse: 0.88728219
387
+ [2026-01-09 09:55:01 root] (train_utils.py 185): INFO layer 19 lwc lac iter 1, lr 0.00478408 time 3.909924s, mse: 0.57078516
388
+ [2026-01-09 09:55:05 root] (train_utils.py 185): INFO layer 19 lwc lac iter 2, lr 0.00452302 time 4.041203s, mse: 0.45792666
389
+ [2026-01-09 09:55:09 root] (train_utils.py 185): INFO layer 19 lwc lac iter 3, lr 0.00417365 time 3.979156s, mse: 0.43537480
390
+ [2026-01-09 09:55:13 root] (train_utils.py 185): INFO layer 19 lwc lac iter 4, lr 0.00375125 time 3.976704s, mse: 0.42894897
391
+ [2026-01-09 09:55:17 root] (train_utils.py 185): INFO layer 19 lwc lac iter 5, lr 0.00327427 time 3.986418s, mse: 0.42462113
392
+ [2026-01-09 09:55:21 root] (train_utils.py 185): INFO layer 19 lwc lac iter 6, lr 0.00276356 time 3.974231s, mse: 0.42157629
393
+ [2026-01-09 09:55:25 root] (train_utils.py 185): INFO layer 19 lwc lac iter 7, lr 0.00224144 time 3.975169s, mse: 0.41864219
394
+ [2026-01-09 09:55:29 root] (train_utils.py 185): INFO layer 19 lwc lac iter 8, lr 0.00173073 time 3.985224s, mse: 0.41570342
395
+ [2026-01-09 09:55:33 root] (train_utils.py 185): INFO layer 19 lwc lac iter 9, lr 0.00125375 time 3.964523s, mse: 0.41345572
396
+ [2026-01-09 09:55:37 root] (train_utils.py 185): INFO layer 19 lwc lac iter 10, lr 0.00083135 time 3.971512s, mse: 0.41054672
397
+ [2026-01-09 09:55:41 root] (train_utils.py 185): INFO layer 19 lwc lac iter 11, lr 0.00048198 time 3.981757s, mse: 0.40846488
398
+ [2026-01-09 09:55:45 root] (train_utils.py 185): INFO layer 19 lwc lac iter 12, lr 0.00022092 time 3.930552s, mse: 0.40727249
399
+ [2026-01-09 09:55:48 root] (train_utils.py 185): INFO layer 19 lwc lac iter 13, lr 0.00005958 time 3.955016s, mse: 0.40628025
400
+ [2026-01-09 09:55:53 root] (train_utils.py 185): INFO layer 19 lwc lac iter 14, lr 0.00000500 time 4.286919s, mse: 0.40573606
401
+ [2026-01-09 09:55:53 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
402
+ [2026-01-09 09:55:54 root] (train_utils.py 108): INFO ========= Layer 20 =========
403
+ [2026-01-09 09:56:02 root] (train_utils.py 185): INFO layer 20 lwc lac iter 0, lr 0.00494542 time 5.274112s, mse: 0.88836050
404
+ [2026-01-09 09:56:06 root] (train_utils.py 185): INFO layer 20 lwc lac iter 1, lr 0.00478408 time 4.440670s, mse: 0.59483135
405
+ [2026-01-09 09:56:11 root] (train_utils.py 185): INFO layer 20 lwc lac iter 2, lr 0.00452302 time 4.416258s, mse: 0.48579982
406
+ [2026-01-09 09:56:14 root] (train_utils.py 185): INFO layer 20 lwc lac iter 3, lr 0.00417365 time 3.951197s, mse: 0.46583182
407
+ [2026-01-09 09:56:19 root] (train_utils.py 185): INFO layer 20 lwc lac iter 4, lr 0.00375125 time 4.961442s, mse: 0.46044937
408
+ [2026-01-09 09:56:27 root] (train_utils.py 185): INFO layer 20 lwc lac iter 5, lr 0.00327427 time 7.579243s, mse: 0.45749170
409
+ [2026-01-09 09:56:35 root] (train_utils.py 185): INFO layer 20 lwc lac iter 6, lr 0.00276356 time 7.570611s, mse: 0.45316568
410
+ [2026-01-09 09:56:42 root] (train_utils.py 185): INFO layer 20 lwc lac iter 7, lr 0.00224144 time 7.582004s, mse: 0.45053339
411
+ [2026-01-09 09:56:50 root] (train_utils.py 185): INFO layer 20 lwc lac iter 8, lr 0.00173073 time 7.588964s, mse: 0.44832462
412
+ [2026-01-09 09:56:55 root] (train_utils.py 185): INFO layer 20 lwc lac iter 9, lr 0.00125375 time 5.473780s, mse: 0.44616416
413
+ [2026-01-09 09:56:59 root] (train_utils.py 185): INFO layer 20 lwc lac iter 10, lr 0.00083135 time 3.885696s, mse: 0.44334349
414
+ [2026-01-09 09:57:04 root] (train_utils.py 185): INFO layer 20 lwc lac iter 11, lr 0.00048198 time 4.542222s, mse: 0.44204527
415
+ [2026-01-09 09:57:11 root] (train_utils.py 185): INFO layer 20 lwc lac iter 12, lr 0.00022092 time 7.207450s, mse: 0.43987796
416
+ [2026-01-09 09:57:18 root] (train_utils.py 185): INFO layer 20 lwc lac iter 13, lr 0.00005958 time 7.203997s, mse: 0.43863490
417
+ [2026-01-09 09:57:25 root] (train_utils.py 185): INFO layer 20 lwc lac iter 14, lr 0.00000500 time 7.229440s, mse: 0.43791217
418
+ [2026-01-09 09:57:26 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
419
+ [2026-01-09 09:57:26 root] (train_utils.py 108): INFO ========= Layer 21 =========
420
+ [2026-01-09 09:57:35 root] (train_utils.py 185): INFO layer 21 lwc lac iter 0, lr 0.00494542 time 5.521300s, mse: 1.18043423
421
+ [2026-01-09 09:57:39 root] (train_utils.py 185): INFO layer 21 lwc lac iter 1, lr 0.00478408 time 3.878793s, mse: 0.77954561
422
+ [2026-01-09 09:57:45 root] (train_utils.py 185): INFO layer 21 lwc lac iter 2, lr 0.00452302 time 5.689105s, mse: 0.64111829
423
+ [2026-01-09 09:57:52 root] (train_utils.py 185): INFO layer 21 lwc lac iter 3, lr 0.00417365 time 7.237405s, mse: 0.61397409
424
+ [2026-01-09 09:58:00 root] (train_utils.py 185): INFO layer 21 lwc lac iter 4, lr 0.00375125 time 7.234711s, mse: 0.60631013
425
+ [2026-01-09 09:58:07 root] (train_utils.py 185): INFO layer 21 lwc lac iter 5, lr 0.00327427 time 7.257921s, mse: 0.60047567
426
+ [2026-01-09 09:58:13 root] (train_utils.py 185): INFO layer 21 lwc lac iter 6, lr 0.00276356 time 6.723596s, mse: 0.59512597
427
+ [2026-01-09 09:58:18 root] (train_utils.py 185): INFO layer 21 lwc lac iter 7, lr 0.00224144 time 4.947057s, mse: 0.59215677
428
+ [2026-01-09 09:58:24 root] (train_utils.py 185): INFO layer 21 lwc lac iter 8, lr 0.00173073 time 5.440034s, mse: 0.58796024
429
+ [2026-01-09 09:58:28 root] (train_utils.py 185): INFO layer 21 lwc lac iter 9, lr 0.00125375 time 4.097990s, mse: 0.58513182
430
+ [2026-01-09 09:58:32 root] (train_utils.py 185): INFO layer 21 lwc lac iter 10, lr 0.00083135 time 3.881567s, mse: 0.58225924
431
+ [2026-01-09 09:58:39 root] (train_utils.py 185): INFO layer 21 lwc lac iter 11, lr 0.00048198 time 7.507426s, mse: 0.57988369
432
+ [2026-01-09 09:58:47 root] (train_utils.py 185): INFO layer 21 lwc lac iter 12, lr 0.00022092 time 7.586020s, mse: 0.57718277
433
+ [2026-01-09 09:58:55 root] (train_utils.py 185): INFO layer 21 lwc lac iter 13, lr 0.00005958 time 7.583313s, mse: 0.57546204
434
+ [2026-01-09 09:59:02 root] (train_utils.py 185): INFO layer 21 lwc lac iter 14, lr 0.00000500 time 7.600305s, mse: 0.57469940
435
+ [2026-01-09 09:59:03 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
436
+ [2026-01-09 09:59:03 root] (train_utils.py 108): INFO ========= Layer 22 =========
437
+ [2026-01-09 09:59:13 root] (train_utils.py 185): INFO layer 22 lwc lac iter 0, lr 0.00494542 time 5.760921s, mse: 1.88664389
438
+ [2026-01-09 09:59:18 root] (train_utils.py 185): INFO layer 22 lwc lac iter 1, lr 0.00478408 time 4.911954s, mse: 1.18959606
439
+ [2026-01-09 09:59:22 root] (train_utils.py 185): INFO layer 22 lwc lac iter 2, lr 0.00452302 time 4.495867s, mse: 0.95907360
440
+ [2026-01-09 09:59:26 root] (train_utils.py 185): INFO layer 22 lwc lac iter 3, lr 0.00417365 time 3.928732s, mse: 0.91428280
441
+ [2026-01-09 09:59:30 root] (train_utils.py 185): INFO layer 22 lwc lac iter 4, lr 0.00375125 time 3.887553s, mse: 0.90376323
442
+ [2026-01-09 09:59:34 root] (train_utils.py 185): INFO layer 22 lwc lac iter 5, lr 0.00327427 time 3.897269s, mse: 0.89363086
443
+ [2026-01-09 09:59:41 root] (train_utils.py 185): INFO layer 22 lwc lac iter 6, lr 0.00276356 time 7.003159s, mse: 0.88751125
444
+ [2026-01-09 09:59:49 root] (train_utils.py 185): INFO layer 22 lwc lac iter 7, lr 0.00224144 time 8.405993s, mse: 0.87932986
445
+ [2026-01-09 09:59:58 root] (train_utils.py 185): INFO layer 22 lwc lac iter 8, lr 0.00173073 time 8.416431s, mse: 0.87506205
446
+ [2026-01-09 10:00:06 root] (train_utils.py 185): INFO layer 22 lwc lac iter 9, lr 0.00125375 time 8.401184s, mse: 0.86960399
447
+ [2026-01-09 10:00:14 root] (train_utils.py 185): INFO layer 22 lwc lac iter 10, lr 0.00083135 time 8.401504s, mse: 0.86433518
448
+ [2026-01-09 10:00:21 root] (train_utils.py 185): INFO layer 22 lwc lac iter 11, lr 0.00048198 time 7.016203s, mse: 0.85831034
449
+ [2026-01-09 10:00:25 root] (train_utils.py 185): INFO layer 22 lwc lac iter 12, lr 0.00022092 time 3.926229s, mse: 0.85434479
450
+ [2026-01-09 10:00:29 root] (train_utils.py 185): INFO layer 22 lwc lac iter 13, lr 0.00005958 time 3.888714s, mse: 0.85274106
451
+ [2026-01-09 10:00:33 root] (train_utils.py 185): INFO layer 22 lwc lac iter 14, lr 0.00000500 time 3.876585s, mse: 0.85105854
452
+ [2026-01-09 10:00:34 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
453
+ [2026-01-09 10:00:34 root] (train_utils.py 108): INFO ========= Layer 23 =========
454
+ [2026-01-09 10:00:46 root] (train_utils.py 185): INFO layer 23 lwc lac iter 0, lr 0.00494542 time 9.105386s, mse: 2.56160784
455
+ [2026-01-09 10:00:54 root] (train_utils.py 185): INFO layer 23 lwc lac iter 1, lr 0.00478408 time 8.433470s, mse: 1.69400561
456
+ [2026-01-09 10:01:02 root] (train_utils.py 185): INFO layer 23 lwc lac iter 2, lr 0.00452302 time 8.508389s, mse: 1.40092814
457
+ [2026-01-09 10:01:11 root] (train_utils.py 185): INFO layer 23 lwc lac iter 3, lr 0.00417365 time 8.423953s, mse: 1.33960748
458
+ [2026-01-09 10:01:19 root] (train_utils.py 185): INFO layer 23 lwc lac iter 4, lr 0.00375125 time 8.422999s, mse: 1.31923652
459
+ [2026-01-09 10:01:25 root] (train_utils.py 185): INFO layer 23 lwc lac iter 5, lr 0.00327427 time 5.676231s, mse: 1.30260742
460
+ [2026-01-09 10:01:29 root] (train_utils.py 185): INFO layer 23 lwc lac iter 6, lr 0.00276356 time 4.424892s, mse: 1.29341400
461
+ [2026-01-09 10:01:34 root] (train_utils.py 185): INFO layer 23 lwc lac iter 7, lr 0.00224144 time 4.521775s, mse: 1.28473794
462
+ [2026-01-09 10:01:38 root] (train_utils.py 185): INFO layer 23 lwc lac iter 8, lr 0.00173073 time 4.509436s, mse: 1.27725101
463
+ [2026-01-09 10:01:43 root] (train_utils.py 185): INFO layer 23 lwc lac iter 9, lr 0.00125375 time 4.399419s, mse: 1.27071691
464
+ [2026-01-09 10:01:47 root] (train_utils.py 185): INFO layer 23 lwc lac iter 10, lr 0.00083135 time 4.038232s, mse: 1.26552820
465
+ [2026-01-09 10:01:51 root] (train_utils.py 185): INFO layer 23 lwc lac iter 11, lr 0.00048198 time 3.967611s, mse: 1.26018000
466
+ [2026-01-09 10:01:55 root] (train_utils.py 185): INFO layer 23 lwc lac iter 12, lr 0.00022092 time 3.895498s, mse: 1.25696874
467
+ [2026-01-09 10:01:59 root] (train_utils.py 185): INFO layer 23 lwc lac iter 13, lr 0.00005958 time 3.883907s, mse: 1.25348544
468
+ [2026-01-09 10:02:06 root] (train_utils.py 185): INFO layer 23 lwc lac iter 14, lr 0.00000500 time 6.941385s, mse: 1.25113153
469
+ [2026-01-09 10:02:06 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
470
+ [2026-01-09 10:02:06 root] (train_utils.py 108): INFO ========= Layer 24 =========
471
+ [2026-01-09 10:02:21 root] (train_utils.py 185): INFO layer 24 lwc lac iter 0, lr 0.00494542 time 10.583919s, mse: 3.33080626
472
+ [2026-01-09 10:02:31 root] (train_utils.py 185): INFO layer 24 lwc lac iter 1, lr 0.00478408 time 9.517836s, mse: 2.21739531
473
+ [2026-01-09 10:02:41 root] (train_utils.py 185): INFO layer 24 lwc lac iter 2, lr 0.00452302 time 9.536418s, mse: 1.83558488
474
+ [2026-01-09 10:02:50 root] (train_utils.py 185): INFO layer 24 lwc lac iter 3, lr 0.00417365 time 9.505691s, mse: 1.75192118
475
+ [2026-01-09 10:03:00 root] (train_utils.py 185): INFO layer 24 lwc lac iter 4, lr 0.00375125 time 9.494667s, mse: 1.73021388
476
+ [2026-01-09 10:03:09 root] (train_utils.py 185): INFO layer 24 lwc lac iter 5, lr 0.00327427 time 9.531198s, mse: 1.70965135
477
+ [2026-01-09 10:03:16 root] (train_utils.py 185): INFO layer 24 lwc lac iter 6, lr 0.00276356 time 7.118713s, mse: 1.69753647
478
+ [2026-01-09 10:03:20 root] (train_utils.py 185): INFO layer 24 lwc lac iter 7, lr 0.00224144 time 4.266183s, mse: 1.68364048
479
+ [2026-01-09 10:03:25 root] (train_utils.py 185): INFO layer 24 lwc lac iter 8, lr 0.00173073 time 4.886819s, mse: 1.67123342
480
+ [2026-01-09 10:03:30 root] (train_utils.py 185): INFO layer 24 lwc lac iter 9, lr 0.00125375 time 5.085745s, mse: 1.66224420
481
+ [2026-01-09 10:03:34 root] (train_utils.py 185): INFO layer 24 lwc lac iter 10, lr 0.00083135 time 4.032805s, mse: 1.65476453
482
+ [2026-01-09 10:03:38 root] (train_utils.py 185): INFO layer 24 lwc lac iter 11, lr 0.00048198 time 3.882242s, mse: 1.64498436
483
+ [2026-01-09 10:03:42 root] (train_utils.py 185): INFO layer 24 lwc lac iter 12, lr 0.00022092 time 3.881795s, mse: 1.63647079
484
+ [2026-01-09 10:03:46 root] (train_utils.py 185): INFO layer 24 lwc lac iter 13, lr 0.00005958 time 3.892410s, mse: 1.63291585
485
+ [2026-01-09 10:03:50 root] (train_utils.py 185): INFO layer 24 lwc lac iter 14, lr 0.00000500 time 3.886352s, mse: 1.63007939
486
+ [2026-01-09 10:03:50 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
487
+ [2026-01-09 10:03:51 root] (train_utils.py 108): INFO ========= Layer 25 =========
488
+ [2026-01-09 10:03:58 root] (train_utils.py 185): INFO layer 25 lwc lac iter 0, lr 0.00494542 time 4.754002s, mse: 3.67945337
489
+ [2026-01-09 10:04:02 root] (train_utils.py 185): INFO layer 25 lwc lac iter 1, lr 0.00478408 time 3.891057s, mse: 2.39840055
490
+ [2026-01-09 10:04:06 root] (train_utils.py 185): INFO layer 25 lwc lac iter 2, lr 0.00452302 time 3.965408s, mse: 2.00158238
491
+ [2026-01-09 10:04:10 root] (train_utils.py 185): INFO layer 25 lwc lac iter 3, lr 0.00417365 time 3.975547s, mse: 1.92655563
492
+ [2026-01-09 10:04:14 root] (train_utils.py 185): INFO layer 25 lwc lac iter 4, lr 0.00375125 time 3.992167s, mse: 1.90741169
493
+ [2026-01-09 10:04:18 root] (train_utils.py 185): INFO layer 25 lwc lac iter 5, lr 0.00327427 time 3.972306s, mse: 1.89064825
494
+ [2026-01-09 10:04:22 root] (train_utils.py 185): INFO layer 25 lwc lac iter 6, lr 0.00276356 time 3.959593s, mse: 1.88254857
495
+ [2026-01-09 10:04:26 root] (train_utils.py 185): INFO layer 25 lwc lac iter 7, lr 0.00224144 time 3.978096s, mse: 1.87189174
496
+ [2026-01-09 10:04:30 root] (train_utils.py 185): INFO layer 25 lwc lac iter 8, lr 0.00173073 time 4.038383s, mse: 1.86226833
497
+ [2026-01-09 10:04:34 root] (train_utils.py 185): INFO layer 25 lwc lac iter 9, lr 0.00125375 time 3.965123s, mse: 1.85414529
498
+ [2026-01-09 10:04:38 root] (train_utils.py 185): INFO layer 25 lwc lac iter 10, lr 0.00083135 time 3.987170s, mse: 1.84632003
499
+ [2026-01-09 10:04:42 root] (train_utils.py 185): INFO layer 25 lwc lac iter 11, lr 0.00048198 time 3.968241s, mse: 1.83962476
500
+ [2026-01-09 10:04:46 root] (train_utils.py 185): INFO layer 25 lwc lac iter 12, lr 0.00022092 time 3.965485s, mse: 1.83272731
501
+ [2026-01-09 10:04:50 root] (train_utils.py 185): INFO layer 25 lwc lac iter 13, lr 0.00005958 time 3.952049s, mse: 1.83188641
502
+ [2026-01-09 10:04:54 root] (train_utils.py 185): INFO layer 25 lwc lac iter 14, lr 0.00000500 time 4.158629s, mse: 1.82856822
503
+ [2026-01-09 10:04:54 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
504
+ [2026-01-09 10:04:55 root] (train_utils.py 108): INFO ========= Layer 26 =========
505
+ [2026-01-09 10:05:03 root] (train_utils.py 185): INFO layer 26 lwc lac iter 0, lr 0.00494542 time 5.414984s, mse: 4.35819054
506
+ [2026-01-09 10:05:08 root] (train_utils.py 185): INFO layer 26 lwc lac iter 1, lr 0.00478408 time 4.391829s, mse: 2.94494462
507
+ [2026-01-09 10:05:12 root] (train_utils.py 185): INFO layer 26 lwc lac iter 2, lr 0.00452302 time 4.448181s, mse: 2.46222878
508
+ [2026-01-09 10:05:16 root] (train_utils.py 185): INFO layer 26 lwc lac iter 3, lr 0.00417365 time 4.035336s, mse: 2.36697221
509
+ [2026-01-09 10:05:20 root] (train_utils.py 185): INFO layer 26 lwc lac iter 4, lr 0.00375125 time 3.881711s, mse: 2.34871936
510
+ [2026-01-09 10:05:27 root] (train_utils.py 185): INFO layer 26 lwc lac iter 5, lr 0.00327427 time 7.229694s, mse: 2.33013940
511
+ [2026-01-09 10:05:35 root] (train_utils.py 185): INFO layer 26 lwc lac iter 6, lr 0.00276356 time 7.588176s, mse: 2.31725478
512
+ [2026-01-09 10:05:42 root] (train_utils.py 185): INFO layer 26 lwc lac iter 7, lr 0.00224144 time 7.586895s, mse: 2.30295658
513
+ [2026-01-09 10:05:50 root] (train_utils.py 185): INFO layer 26 lwc lac iter 8, lr 0.00173073 time 7.570282s, mse: 2.29171467
514
+ [2026-01-09 10:05:57 root] (train_utils.py 185): INFO layer 26 lwc lac iter 9, lr 0.00125375 time 6.883629s, mse: 2.28112888
515
+ [2026-01-09 10:06:01 root] (train_utils.py 185): INFO layer 26 lwc lac iter 10, lr 0.00083135 time 3.904047s, mse: 2.27260423
516
+ [2026-01-09 10:06:05 root] (train_utils.py 185): INFO layer 26 lwc lac iter 11, lr 0.00048198 time 3.889935s, mse: 2.26187754
517
+ [2026-01-09 10:06:12 root] (train_utils.py 185): INFO layer 26 lwc lac iter 12, lr 0.00022092 time 7.026200s, mse: 2.25517917
518
+ [2026-01-09 10:06:19 root] (train_utils.py 185): INFO layer 26 lwc lac iter 13, lr 0.00005958 time 7.208640s, mse: 2.24800634
519
+ [2026-01-09 10:06:26 root] (train_utils.py 185): INFO layer 26 lwc lac iter 14, lr 0.00000500 time 7.211710s, mse: 2.24403787
520
+ [2026-01-09 10:06:27 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
521
+ [2026-01-09 10:06:27 root] (train_utils.py 108): INFO ========= Layer 27 =========
522
+ [2026-01-09 10:06:37 root] (train_utils.py 185): INFO layer 27 lwc lac iter 0, lr 0.00494542 time 6.275254s, mse: 5.94560862
523
+ [2026-01-09 10:06:41 root] (train_utils.py 185): INFO layer 27 lwc lac iter 1, lr 0.00478408 time 3.890866s, mse: 3.95834851
524
+ [2026-01-09 10:06:46 root] (train_utils.py 185): INFO layer 27 lwc lac iter 2, lr 0.00452302 time 4.665193s, mse: 3.32281756
525
+ [2026-01-09 10:06:53 root] (train_utils.py 185): INFO layer 27 lwc lac iter 3, lr 0.00417365 time 7.189707s, mse: 3.18086267
526
+ [2026-01-09 10:07:00 root] (train_utils.py 185): INFO layer 27 lwc lac iter 4, lr 0.00375125 time 7.197548s, mse: 3.14467168
527
+ [2026-01-09 10:07:07 root] (train_utils.py 185): INFO layer 27 lwc lac iter 5, lr 0.00327427 time 7.219765s, mse: 3.12000346
528
+ [2026-01-09 10:07:15 root] (train_utils.py 185): INFO layer 27 lwc lac iter 6, lr 0.00276356 time 7.202462s, mse: 3.09776139
529
+ [2026-01-09 10:07:20 root] (train_utils.py 185): INFO layer 27 lwc lac iter 7, lr 0.00224144 time 5.093818s, mse: 3.07834363
530
+ [2026-01-09 10:07:25 root] (train_utils.py 185): INFO layer 27 lwc lac iter 8, lr 0.00173073 time 5.226724s, mse: 3.06277657
531
+ [2026-01-09 10:07:30 root] (train_utils.py 185): INFO layer 27 lwc lac iter 9, lr 0.00125375 time 4.802370s, mse: 3.04591680
532
+ [2026-01-09 10:07:34 root] (train_utils.py 185): INFO layer 27 lwc lac iter 10, lr 0.00083135 time 3.894957s, mse: 3.03134632
533
+ [2026-01-09 10:07:40 root] (train_utils.py 185): INFO layer 27 lwc lac iter 11, lr 0.00048198 time 6.565792s, mse: 3.01916480
534
+ [2026-01-09 10:07:48 root] (train_utils.py 185): INFO layer 27 lwc lac iter 12, lr 0.00022092 time 7.576006s, mse: 3.00719571
535
+ [2026-01-09 10:07:55 root] (train_utils.py 185): INFO layer 27 lwc lac iter 13, lr 0.00005958 time 7.576934s, mse: 2.99984956
536
+ [2026-01-09 10:08:03 root] (train_utils.py 185): INFO layer 27 lwc lac iter 14, lr 0.00000500 time 7.584801s, mse: 2.99120903
537
+ [2026-01-09 10:08:03 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
538
+ [2026-01-09 10:08:04 root] (train_utils.py 108): INFO ========= Layer 28 =========
539
+ [2026-01-09 10:08:13 root] (train_utils.py 185): INFO layer 28 lwc lac iter 0, lr 0.00494542 time 5.645861s, mse: 8.40579605
540
+ [2026-01-09 10:08:18 root] (train_utils.py 185): INFO layer 28 lwc lac iter 1, lr 0.00478408 time 4.716225s, mse: 5.55529737
541
+ [2026-01-09 10:08:23 root] (train_utils.py 185): INFO layer 28 lwc lac iter 2, lr 0.00452302 time 4.781032s, mse: 4.64479589
542
+ [2026-01-09 10:08:27 root] (train_utils.py 185): INFO layer 28 lwc lac iter 3, lr 0.00417365 time 4.215279s, mse: 4.46341419
543
+ [2026-01-09 10:08:31 root] (train_utils.py 185): INFO layer 28 lwc lac iter 4, lr 0.00375125 time 3.945424s, mse: 4.40386772
544
+ [2026-01-09 10:08:35 root] (train_utils.py 185): INFO layer 28 lwc lac iter 5, lr 0.00327427 time 3.895538s, mse: 4.37245226
545
+ [2026-01-09 10:08:39 root] (train_utils.py 185): INFO layer 28 lwc lac iter 6, lr 0.00276356 time 4.438238s, mse: 4.34240580
546
+ [2026-01-09 10:08:48 root] (train_utils.py 185): INFO layer 28 lwc lac iter 7, lr 0.00224144 time 8.389002s, mse: 4.31763363
547
+ [2026-01-09 10:08:56 root] (train_utils.py 185): INFO layer 28 lwc lac iter 8, lr 0.00173073 time 8.383581s, mse: 4.29854107
548
+ [2026-01-09 10:09:05 root] (train_utils.py 185): INFO layer 28 lwc lac iter 9, lr 0.00125375 time 8.399694s, mse: 4.28071547
549
+ [2026-01-09 10:09:13 root] (train_utils.py 185): INFO layer 28 lwc lac iter 10, lr 0.00083135 time 8.417864s, mse: 4.26679897
550
+ [2026-01-09 10:09:21 root] (train_utils.py 185): INFO layer 28 lwc lac iter 11, lr 0.00048198 time 8.410913s, mse: 4.24268007
551
+ [2026-01-09 10:09:26 root] (train_utils.py 185): INFO layer 28 lwc lac iter 12, lr 0.00022092 time 5.137883s, mse: 4.22641373
552
+ [2026-01-09 10:09:30 root] (train_utils.py 185): INFO layer 28 lwc lac iter 13, lr 0.00005958 time 3.939935s, mse: 4.22128248
553
+ [2026-01-09 10:09:34 root] (train_utils.py 185): INFO layer 28 lwc lac iter 14, lr 0.00000500 time 3.886578s, mse: 4.21494389
554
+ [2026-01-09 10:09:35 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
555
+ [2026-01-09 10:09:35 root] (train_utils.py 108): INFO ========= Layer 29 =========
556
+ [2026-01-09 10:09:46 root] (train_utils.py 185): INFO layer 29 lwc lac iter 0, lr 0.00494542 time 7.813271s, mse: 10.38746834
557
+ [2026-01-09 10:09:54 root] (train_utils.py 185): INFO layer 29 lwc lac iter 1, lr 0.00478408 time 8.429181s, mse: 7.14648628
558
+ [2026-01-09 10:10:02 root] (train_utils.py 185): INFO layer 29 lwc lac iter 2, lr 0.00452302 time 8.383594s, mse: 6.03318691
559
+ [2026-01-09 10:10:11 root] (train_utils.py 185): INFO layer 29 lwc lac iter 3, lr 0.00417365 time 8.417278s, mse: 5.78764057
560
+ [2026-01-09 10:10:19 root] (train_utils.py 185): INFO layer 29 lwc lac iter 4, lr 0.00375125 time 8.413698s, mse: 5.71550655
561
+ [2026-01-09 10:10:26 root] (train_utils.py 185): INFO layer 29 lwc lac iter 5, lr 0.00327427 time 7.294332s, mse: 5.66473246
562
+ [2026-01-09 10:10:31 root] (train_utils.py 185): INFO layer 29 lwc lac iter 6, lr 0.00276356 time 4.335289s, mse: 5.61916113
563
+ [2026-01-09 10:10:35 root] (train_utils.py 185): INFO layer 29 lwc lac iter 7, lr 0.00224144 time 4.477592s, mse: 5.58458805
564
+ [2026-01-09 10:10:40 root] (train_utils.py 185): INFO layer 29 lwc lac iter 8, lr 0.00173073 time 4.570168s, mse: 5.54784393
565
+ [2026-01-09 10:10:44 root] (train_utils.py 185): INFO layer 29 lwc lac iter 9, lr 0.00125375 time 4.581471s, mse: 5.52231646
566
+ [2026-01-09 10:10:49 root] (train_utils.py 185): INFO layer 29 lwc lac iter 10, lr 0.00083135 time 4.188355s, mse: 5.48976994
567
+ [2026-01-09 10:10:52 root] (train_utils.py 185): INFO layer 29 lwc lac iter 11, lr 0.00048198 time 3.879935s, mse: 5.46507311
568
+ [2026-01-09 10:10:56 root] (train_utils.py 185): INFO layer 29 lwc lac iter 12, lr 0.00022092 time 3.884995s, mse: 5.44575977
569
+ [2026-01-09 10:11:03 root] (train_utils.py 185): INFO layer 29 lwc lac iter 13, lr 0.00005958 time 6.919443s, mse: 5.43577242
570
+ [2026-01-09 10:11:13 root] (train_utils.py 185): INFO layer 29 lwc lac iter 14, lr 0.00000500 time 9.446140s, mse: 5.42604542
571
+ [2026-01-09 10:11:13 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
572
+ [2026-01-09 10:11:14 root] (train_utils.py 108): INFO ========= Layer 30 =========
573
+ [2026-01-09 10:11:29 root] (train_utils.py 185): INFO layer 30 lwc lac iter 0, lr 0.00494542 time 10.467985s, mse: 16.29405975
574
+ [2026-01-09 10:11:38 root] (train_utils.py 185): INFO layer 30 lwc lac iter 1, lr 0.00478408 time 9.468451s, mse: 11.01632500
575
+ [2026-01-09 10:11:48 root] (train_utils.py 185): INFO layer 30 lwc lac iter 2, lr 0.00452302 time 9.468787s, mse: 9.27882481
576
+ [2026-01-09 10:11:57 root] (train_utils.py 185): INFO layer 30 lwc lac iter 3, lr 0.00417365 time 9.456441s, mse: 8.87542439
577
+ [2026-01-09 10:12:07 root] (train_utils.py 185): INFO layer 30 lwc lac iter 4, lr 0.00375125 time 9.510264s, mse: 8.75351048
578
+ [2026-01-09 10:12:14 root] (train_utils.py 185): INFO layer 30 lwc lac iter 5, lr 0.00327427 time 7.368535s, mse: 8.65880680
579
+ [2026-01-09 10:12:18 root] (train_utils.py 185): INFO layer 30 lwc lac iter 6, lr 0.00276356 time 4.237104s, mse: 8.60634327
580
+ [2026-01-09 10:12:23 root] (train_utils.py 185): INFO layer 30 lwc lac iter 7, lr 0.00224144 time 4.838456s, mse: 8.53597736
581
+ [2026-01-09 10:12:28 root] (train_utils.py 185): INFO layer 30 lwc lac iter 8, lr 0.00173073 time 5.074469s, mse: 8.50352001
582
+ [2026-01-09 10:12:32 root] (train_utils.py 185): INFO layer 30 lwc lac iter 9, lr 0.00125375 time 4.104348s, mse: 8.44190311
583
+ [2026-01-09 10:12:36 root] (train_utils.py 185): INFO layer 30 lwc lac iter 10, lr 0.00083135 time 3.886484s, mse: 8.40491486
584
+ [2026-01-09 10:12:40 root] (train_utils.py 185): INFO layer 30 lwc lac iter 11, lr 0.00048198 time 3.886079s, mse: 8.38511753
585
+ [2026-01-09 10:12:44 root] (train_utils.py 185): INFO layer 30 lwc lac iter 12, lr 0.00022092 time 3.892271s, mse: 8.35692787
586
+ [2026-01-09 10:12:48 root] (train_utils.py 185): INFO layer 30 lwc lac iter 13, lr 0.00005958 time 3.886010s, mse: 8.35674667
587
+ [2026-01-09 10:12:52 root] (train_utils.py 185): INFO layer 30 lwc lac iter 14, lr 0.00000500 time 3.880266s, mse: 8.34408569
588
+ [2026-01-09 10:12:52 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
589
+ [2026-01-09 10:12:52 root] (train_utils.py 108): INFO ========= Layer 31 =========
590
+ [2026-01-09 10:13:00 root] (train_utils.py 185): INFO layer 31 lwc lac iter 0, lr 0.00494542 time 4.802090s, mse: 20.78250885
591
+ [2026-01-09 10:13:04 root] (train_utils.py 185): INFO layer 31 lwc lac iter 1, lr 0.00478408 time 4.032989s, mse: 14.37235165
592
+ [2026-01-09 10:13:08 root] (train_utils.py 185): INFO layer 31 lwc lac iter 2, lr 0.00452302 time 3.989193s, mse: 12.13233566
593
+ [2026-01-09 10:13:12 root] (train_utils.py 185): INFO layer 31 lwc lac iter 3, lr 0.00417365 time 3.970271s, mse: 11.62570667
594
+ [2026-01-09 10:13:16 root] (train_utils.py 185): INFO layer 31 lwc lac iter 4, lr 0.00375125 time 3.965022s, mse: 11.51362991
595
+ [2026-01-09 10:13:20 root] (train_utils.py 185): INFO layer 31 lwc lac iter 5, lr 0.00327427 time 3.978235s, mse: 11.42485142
596
+ [2026-01-09 10:13:24 root] (train_utils.py 185): INFO layer 31 lwc lac iter 6, lr 0.00276356 time 3.956572s, mse: 11.33607769
597
+ [2026-01-09 10:13:28 root] (train_utils.py 185): INFO layer 31 lwc lac iter 7, lr 0.00224144 time 3.983628s, mse: 11.27843571
598
+ [2026-01-09 10:13:32 root] (train_utils.py 185): INFO layer 31 lwc lac iter 8, lr 0.00173073 time 3.971437s, mse: 11.22037888
599
+ [2026-01-09 10:13:36 root] (train_utils.py 185): INFO layer 31 lwc lac iter 9, lr 0.00125375 time 3.981398s, mse: 11.15839195
600
+ [2026-01-09 10:13:40 root] (train_utils.py 185): INFO layer 31 lwc lac iter 10, lr 0.00083135 time 3.970976s, mse: 11.12734127
601
+ [2026-01-09 10:13:43 root] (train_utils.py 185): INFO layer 31 lwc lac iter 11, lr 0.00048198 time 3.968406s, mse: 11.08810806
602
+ [2026-01-09 10:13:47 root] (train_utils.py 185): INFO layer 31 lwc lac iter 12, lr 0.00022092 time 3.966583s, mse: 11.05513668
603
+ [2026-01-09 10:13:52 root] (train_utils.py 185): INFO layer 31 lwc lac iter 13, lr 0.00005958 time 4.241404s, mse: 11.03436947
604
+ [2026-01-09 10:13:56 root] (train_utils.py 185): INFO layer 31 lwc lac iter 14, lr 0.00000500 time 4.397378s, mse: 11.01393795
605
+ [2026-01-09 10:13:57 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
606
+ [2026-01-09 10:13:57 root] (train_utils.py 108): INFO ========= Layer 32 =========
607
+ [2026-01-09 10:14:05 root] (train_utils.py 185): INFO layer 32 lwc lac iter 0, lr 0.00494542 time 5.240127s, mse: 28.37956429
608
+ [2026-01-09 10:14:10 root] (train_utils.py 185): INFO layer 32 lwc lac iter 1, lr 0.00478408 time 4.407370s, mse: 19.76789856
609
+ [2026-01-09 10:14:14 root] (train_utils.py 185): INFO layer 32 lwc lac iter 2, lr 0.00452302 time 4.113342s, mse: 16.61169624
610
+ [2026-01-09 10:14:18 root] (train_utils.py 185): INFO layer 32 lwc lac iter 3, lr 0.00417365 time 3.878139s, mse: 15.88970184
611
+ [2026-01-09 10:14:24 root] (train_utils.py 185): INFO layer 32 lwc lac iter 4, lr 0.00375125 time 6.959631s, mse: 15.74769402
612
+ [2026-01-09 10:14:32 root] (train_utils.py 185): INFO layer 32 lwc lac iter 5, lr 0.00327427 time 7.576349s, mse: 15.61922455
613
+ [2026-01-09 10:14:40 root] (train_utils.py 185): INFO layer 32 lwc lac iter 6, lr 0.00276356 time 7.601277s, mse: 15.51004982
614
+ [2026-01-09 10:14:47 root] (train_utils.py 185): INFO layer 32 lwc lac iter 7, lr 0.00224144 time 7.561176s, mse: 15.42904854
615
+ [2026-01-09 10:14:54 root] (train_utils.py 185): INFO layer 32 lwc lac iter 8, lr 0.00173073 time 7.163068s, mse: 15.34880447
616
+ [2026-01-09 10:14:58 root] (train_utils.py 185): INFO layer 32 lwc lac iter 9, lr 0.00125375 time 3.901019s, mse: 15.27359772
617
+ [2026-01-09 10:15:02 root] (train_utils.py 185): INFO layer 32 lwc lac iter 10, lr 0.00083135 time 3.876441s, mse: 15.21441174
618
+ [2026-01-09 10:15:09 root] (train_utils.py 185): INFO layer 32 lwc lac iter 11, lr 0.00048198 time 6.410050s, mse: 15.16252708
619
+ [2026-01-09 10:15:16 root] (train_utils.py 185): INFO layer 32 lwc lac iter 12, lr 0.00022092 time 7.196454s, mse: 15.10843849
620
+ [2026-01-09 10:15:23 root] (train_utils.py 185): INFO layer 32 lwc lac iter 13, lr 0.00005958 time 7.212720s, mse: 15.08382893
621
+ [2026-01-09 10:15:30 root] (train_utils.py 185): INFO layer 32 lwc lac iter 14, lr 0.00000500 time 7.231273s, mse: 15.06546974
622
+ [2026-01-09 10:15:31 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
623
+ [2026-01-09 10:15:31 root] (train_utils.py 108): INFO ========= Layer 33 =========
624
+ [2026-01-09 10:15:39 root] (train_utils.py 185): INFO layer 33 lwc lac iter 0, lr 0.00494542 time 4.706686s, mse: 41.54327011
625
+ [2026-01-09 10:15:44 root] (train_utils.py 185): INFO layer 33 lwc lac iter 1, lr 0.00478408 time 4.436514s, mse: 27.93664551
626
+ [2026-01-09 10:15:51 root] (train_utils.py 185): INFO layer 33 lwc lac iter 2, lr 0.00452302 time 7.198779s, mse: 23.32941628
627
+ [2026-01-09 10:15:58 root] (train_utils.py 185): INFO layer 33 lwc lac iter 3, lr 0.00417365 time 7.216425s, mse: 22.34293175
628
+ [2026-01-09 10:16:05 root] (train_utils.py 185): INFO layer 33 lwc lac iter 4, lr 0.00375125 time 7.222544s, mse: 22.07669640
629
+ [2026-01-09 10:16:12 root] (train_utils.py 185): INFO layer 33 lwc lac iter 5, lr 0.00327427 time 7.230317s, mse: 21.87960243
630
+ [2026-01-09 10:16:18 root] (train_utils.py 185): INFO layer 33 lwc lac iter 6, lr 0.00276356 time 5.261297s, mse: 21.73635674
631
+ [2026-01-09 10:16:23 root] (train_utils.py 185): INFO layer 33 lwc lac iter 7, lr 0.00224144 time 5.232856s, mse: 21.58724403
632
+ [2026-01-09 10:16:28 root] (train_utils.py 185): INFO layer 33 lwc lac iter 8, lr 0.00173073 time 4.793520s, mse: 21.46766853
633
+ [2026-01-09 10:16:32 root] (train_utils.py 185): INFO layer 33 lwc lac iter 9, lr 0.00125375 time 3.886966s, mse: 21.36098099
634
+ [2026-01-09 10:16:37 root] (train_utils.py 185): INFO layer 33 lwc lac iter 10, lr 0.00083135 time 5.417264s, mse: 21.27636719
635
+ [2026-01-09 10:16:45 root] (train_utils.py 185): INFO layer 33 lwc lac iter 11, lr 0.00048198 time 7.600954s, mse: 21.16030693
636
+ [2026-01-09 10:16:52 root] (train_utils.py 185): INFO layer 33 lwc lac iter 12, lr 0.00022092 time 7.574526s, mse: 21.07536125
637
+ [2026-01-09 10:17:00 root] (train_utils.py 185): INFO layer 33 lwc lac iter 13, lr 0.00005958 time 7.584183s, mse: 20.99114990
638
+ [2026-01-09 10:17:07 root] (train_utils.py 185): INFO layer 33 lwc lac iter 14, lr 0.00000500 time 7.586675s, mse: 20.95961761
639
+ [2026-01-09 10:17:08 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
640
+ [2026-01-09 10:17:08 root] (train_utils.py 108): INFO ========= Layer 34 =========
641
+ [2026-01-09 10:17:17 root] (train_utils.py 185): INFO layer 34 lwc lac iter 0, lr 0.00494542 time 5.584197s, mse: 64.93594360
642
+ [2026-01-09 10:17:22 root] (train_utils.py 185): INFO layer 34 lwc lac iter 1, lr 0.00478408 time 4.842231s, mse: 40.86461258
643
+ [2026-01-09 10:17:26 root] (train_utils.py 185): INFO layer 34 lwc lac iter 2, lr 0.00452302 time 4.336210s, mse: 33.65349960
644
+ [2026-01-09 10:17:30 root] (train_utils.py 185): INFO layer 34 lwc lac iter 3, lr 0.00417365 time 3.875152s, mse: 31.96302605
645
+ [2026-01-09 10:17:34 root] (train_utils.py 185): INFO layer 34 lwc lac iter 4, lr 0.00375125 time 3.886546s, mse: 31.66926384
646
+ [2026-01-09 10:17:39 root] (train_utils.py 185): INFO layer 34 lwc lac iter 5, lr 0.00327427 time 5.056141s, mse: 31.07656479
647
+ [2026-01-09 10:17:47 root] (train_utils.py 185): INFO layer 34 lwc lac iter 6, lr 0.00276356 time 8.413598s, mse: 30.91048813
648
+ [2026-01-09 10:17:56 root] (train_utils.py 185): INFO layer 34 lwc lac iter 7, lr 0.00224144 time 8.436667s, mse: 30.05115700
649
+ [2026-01-09 10:18:04 root] (train_utils.py 185): INFO layer 34 lwc lac iter 8, lr 0.00173073 time 8.411937s, mse: 29.89023590
650
+ [2026-01-09 10:18:12 root] (train_utils.py 185): INFO layer 34 lwc lac iter 9, lr 0.00125375 time 8.405565s, mse: 30.35319901
651
+ [2026-01-09 10:18:21 root] (train_utils.py 185): INFO layer 34 lwc lac iter 10, lr 0.00083135 time 8.418024s, mse: 29.46559715
652
+ [2026-01-09 10:18:25 root] (train_utils.py 185): INFO layer 34 lwc lac iter 11, lr 0.00048198 time 4.597981s, mse: 29.05239487
653
+ [2026-01-09 10:18:29 root] (train_utils.py 185): INFO layer 34 lwc lac iter 12, lr 0.00022092 time 3.877557s, mse: 28.86521339
654
+ [2026-01-09 10:18:33 root] (train_utils.py 185): INFO layer 34 lwc lac iter 13, lr 0.00005958 time 3.883606s, mse: 28.74409676
655
+ [2026-01-09 10:18:37 root] (train_utils.py 185): INFO layer 34 lwc lac iter 14, lr 0.00000500 time 4.302986s, mse: 28.70412636
656
+ [2026-01-09 10:18:38 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
657
+ [2026-01-09 10:18:38 root] (train_utils.py 108): INFO ========= Layer 35 =========
658
+ [2026-01-09 10:18:52 root] (train_utils.py 185): INFO layer 35 lwc lac iter 0, lr 0.00494542 time 9.411074s, mse: 108.25781250
659
+ [2026-01-09 10:19:00 root] (train_utils.py 185): INFO layer 35 lwc lac iter 1, lr 0.00478408 time 8.419429s, mse: 38.04971313
660
+ [2026-01-09 10:19:09 root] (train_utils.py 185): INFO layer 35 lwc lac iter 2, lr 0.00452302 time 8.463986s, mse: 31.63025665
661
+ [2026-01-09 10:19:17 root] (train_utils.py 185): INFO layer 35 lwc lac iter 3, lr 0.00417365 time 8.413347s, mse: 29.21376991
662
+ [2026-01-09 10:19:23 root] (train_utils.py 185): INFO layer 35 lwc lac iter 4, lr 0.00375125 time 6.243543s, mse: 28.19089508
663
+ [2026-01-09 10:19:28 root] (train_utils.py 185): INFO layer 35 lwc lac iter 5, lr 0.00327427 time 4.387824s, mse: 28.40728760
664
+ [2026-01-09 10:19:32 root] (train_utils.py 185): INFO layer 35 lwc lac iter 6, lr 0.00276356 time 4.512519s, mse: 27.74842644
665
+ [2026-01-09 10:19:37 root] (train_utils.py 185): INFO layer 35 lwc lac iter 7, lr 0.00224144 time 4.493203s, mse: 27.13273811
666
+ [2026-01-09 10:19:41 root] (train_utils.py 185): INFO layer 35 lwc lac iter 8, lr 0.00173073 time 4.561685s, mse: 26.53238487
667
+ [2026-01-09 10:19:46 root] (train_utils.py 185): INFO layer 35 lwc lac iter 9, lr 0.00125375 time 4.119229s, mse: 26.14052200
668
+ [2026-01-09 10:19:49 root] (train_utils.py 185): INFO layer 35 lwc lac iter 10, lr 0.00083135 time 3.929297s, mse: 25.63203621
669
+ [2026-01-09 10:19:53 root] (train_utils.py 185): INFO layer 35 lwc lac iter 11, lr 0.00048198 time 3.883827s, mse: 25.35079384
670
+ [2026-01-09 10:20:01 root] (train_utils.py 185): INFO layer 35 lwc lac iter 12, lr 0.00022092 time 7.978094s, mse: 25.21109390
671
+ [2026-01-09 10:20:11 root] (train_utils.py 185): INFO layer 35 lwc lac iter 13, lr 0.00005958 time 9.496463s, mse: 24.95710945
672
+ [2026-01-09 10:20:20 root] (train_utils.py 185): INFO layer 35 lwc lac iter 14, lr 0.00000500 time 9.466582s, mse: 24.85692596
673
+ [2026-01-09 10:20:21 root] (train_utils.py 191): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_parameters.pth
674
+ [2026-01-09 10:21:06 root] (main.py 39): INFO Finished reparameterize model.
675
+ [2026-01-09 10:21:27 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.27 -> 0.25 GB (-0.02 GB)
676
+ [2026-01-09 10:21:50 root] (flat_utils.py 231): INFO saved weights at ./outputs/Qwen3-8B/w4a4/exp
677
+ [2026-01-09 10:22:00 root] (main.py 60): INFO wikitext2
678
+ [2026-01-09 10:22:44 root] (main.py 69): INFO 10.263629913330078
679
+ [2026-01-09 10:22:44 root] (main.py 60): INFO c4
680
+ [2026-01-09 10:24:57 root] (main.py 69): INFO 16.17665672302246
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_062728.txt ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-12 06:27:28 root] (args_utils.py 168): INFO Arguments:
2
+ [2026-01-12 06:27:28 root] (args_utils.py 169): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': True,
40
+ 'reload_matrix': False,
41
+ 'resume': True,
42
+ 'save_matrix': False,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': 128,
57
+ 'warmup': False}
58
+ [2026-01-12 06:27:28 root] (args_utils.py 170): INFO ------------------------------------------------------------
59
+ [2026-01-12 06:27:29 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-12 06:27:50 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-12 06:27:55 root] (main.py 29): INFO Finished applying FlatQuant to model.
62
+ [2026-01-12 06:28:38 root] (main.py 39): INFO Finished reparameterize model.
63
+ [2026-01-12 06:29:06 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_063624.txt ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-12 06:36:24 root] (args_utils.py 168): INFO Arguments:
2
+ [2026-01-12 06:36:24 root] (args_utils.py 169): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': True,
42
+ 'save_matrix': True,
43
+ 'seed': 0,
44
+ 'separate_vtrans': False,
45
+ 'tasks': ['piqa',
46
+ 'hellaswag',
47
+ 'arc_easy',
48
+ 'arc_challenge',
49
+ 'winogrande',
50
+ 'lambada_openai'],
51
+ 'v_asym': False,
52
+ 'v_bits': 16,
53
+ 'v_groupsize': -1,
54
+ 'w_asym': False,
55
+ 'w_bits': 4,
56
+ 'w_groupsize': 128,
57
+ 'warmup': False}
58
+ [2026-01-12 06:36:24 root] (args_utils.py 170): INFO ------------------------------------------------------------
59
+ [2026-01-12 06:36:25 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
60
+ [2026-01-12 06:36:43 root] (main.py 25): INFO Finished loading training data.
61
+ [2026-01-12 06:36:49 root] (main.py 29): INFO Finished applying FlatQuant to model.
62
+ [2026-01-12 06:36:53 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
63
+ [2026-01-12 06:37:33 root] (main.py 39): INFO Finished reparameterize model.
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_155601.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-12 15:56:01 root] (args_utils.py 169): INFO Arguments:
2
+ [2026-01-12 15:56:01 root] (args_utils.py 170): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': True,
42
+ 'save_matrix': True,
43
+ 'save_qmodel_path': './qmodel/Qwen3-8B',
44
+ 'seed': 0,
45
+ 'separate_vtrans': False,
46
+ 'tasks': ['piqa',
47
+ 'hellaswag',
48
+ 'arc_easy',
49
+ 'arc_challenge',
50
+ 'winogrande',
51
+ 'lambada_openai'],
52
+ 'v_asym': False,
53
+ 'v_bits': 16,
54
+ 'v_groupsize': -1,
55
+ 'w_asym': False,
56
+ 'w_bits': 4,
57
+ 'w_groupsize': 128,
58
+ 'warmup': False}
59
+ [2026-01-12 15:56:01 root] (args_utils.py 171): INFO ------------------------------------------------------------
60
+ [2026-01-12 15:56:02 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
61
+ [2026-01-12 15:56:23 root] (main.py 25): INFO Finished loading training data.
62
+ [2026-01-12 15:56:30 root] (main.py 29): INFO Finished applying FlatQuant to model.
63
+ [2026-01-12 15:56:34 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
64
+ [2026-01-12 15:57:23 root] (main.py 39): INFO Finished reparameterize model.
65
+ [2026-01-12 15:58:02 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_160154.txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-12 16:01:54 root] (args_utils.py 169): INFO Arguments:
2
+ [2026-01-12 16:01:54 root] (args_utils.py 170): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': True,
42
+ 'save_matrix': True,
43
+ 'save_qmodel_path': './qmodel/Qwen3-8B',
44
+ 'seed': 0,
45
+ 'separate_vtrans': False,
46
+ 'tasks': ['piqa',
47
+ 'hellaswag',
48
+ 'arc_easy',
49
+ 'arc_challenge',
50
+ 'winogrande',
51
+ 'lambada_openai'],
52
+ 'v_asym': False,
53
+ 'v_bits': 16,
54
+ 'v_groupsize': -1,
55
+ 'w_asym': False,
56
+ 'w_bits': 4,
57
+ 'w_groupsize': 128,
58
+ 'warmup': False}
59
+ [2026-01-12 16:01:54 root] (args_utils.py 171): INFO ------------------------------------------------------------
60
+ [2026-01-12 16:01:55 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
61
+ [2026-01-12 16:02:13 root] (main.py 25): INFO Finished loading training data.
62
+ [2026-01-12 16:02:21 root] (main.py 29): INFO Finished applying FlatQuant to model.
63
+ [2026-01-12 16:02:25 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
64
+ [2026-01-12 16:03:15 root] (main.py 39): INFO Finished reparameterize model.
65
+ [2026-01-12 16:03:53 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
66
+ [2026-01-12 16:03:56 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
67
+ [2026-01-12 16:04:16 root] (main.py 98): INFO Quantized model for Qwen3 saved at ./qmodel/Qwen3-8B.
68
+ [2026-01-12 16:04:30 root] (main.py 107): INFO wikitext2
69
+ [2026-01-12 16:09:16 root] (main.py 116): INFO 10.263629913330078
70
+ [2026-01-12 16:09:16 root] (main.py 107): INFO c4
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_163532.txt ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-12 16:35:32 root] (args_utils.py 169): INFO Arguments:
2
+ [2026-01-12 16:35:32 root] (args_utils.py 170): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': True,
42
+ 'save_matrix': True,
43
+ 'save_qmodel_path': './qmodel/Qwen3-8B',
44
+ 'seed': 0,
45
+ 'separate_vtrans': False,
46
+ 'tasks': ['piqa',
47
+ 'hellaswag',
48
+ 'arc_easy',
49
+ 'arc_challenge',
50
+ 'winogrande',
51
+ 'lambada_openai'],
52
+ 'v_asym': False,
53
+ 'v_bits': 16,
54
+ 'v_groupsize': -1,
55
+ 'w_asym': False,
56
+ 'w_bits': 4,
57
+ 'w_groupsize': 128,
58
+ 'warmup': False}
59
+ [2026-01-12 16:35:32 root] (args_utils.py 171): INFO ------------------------------------------------------------
60
+ [2026-01-12 16:35:33 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
61
+ [2026-01-12 16:35:51 root] (main.py 25): INFO Finished loading training data.
62
+ [2026-01-12 16:35:57 root] (main.py 29): INFO Finished applying FlatQuant to model.
63
+ [2026-01-12 16:36:00 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
64
+ [2026-01-12 16:36:45 root] (main.py 39): INFO Finished reparameterize model.
65
+ [2026-01-12 16:37:16 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
66
+ [2026-01-12 16:37:18 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
67
+ [2026-01-12 16:37:45 root] (main.py 145): INFO Quantized model for Qwen3 saved at ./qmodel/Qwen3-8B.
68
+ [2026-01-12 16:37:57 root] (main.py 154): INFO wikitext2
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173005.txt ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-12 17:30:05 root] (args_utils.py 169): INFO Arguments:
2
+ [2026-01-12 17:30:05 root] (args_utils.py 170): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': True,
42
+ 'save_matrix': True,
43
+ 'save_qmodel_path': './qmodel/Qwen3-8B',
44
+ 'seed': 0,
45
+ 'separate_vtrans': False,
46
+ 'tasks': ['piqa',
47
+ 'hellaswag',
48
+ 'arc_easy',
49
+ 'arc_challenge',
50
+ 'winogrande',
51
+ 'lambada_openai'],
52
+ 'v_asym': False,
53
+ 'v_bits': 16,
54
+ 'v_groupsize': -1,
55
+ 'w_asym': False,
56
+ 'w_bits': 4,
57
+ 'w_groupsize': 128,
58
+ 'warmup': False}
59
+ [2026-01-12 17:30:05 root] (args_utils.py 171): INFO ------------------------------------------------------------
60
+ [2026-01-12 17:30:06 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
61
+ [2026-01-12 17:30:26 root] (main.py 25): INFO Finished loading training data.
62
+ [2026-01-12 17:30:33 root] (main.py 29): INFO Finished applying FlatQuant to model.
63
+ [2026-01-12 17:30:37 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
64
+ [2026-01-12 17:31:19 root] (main.py 39): INFO Finished reparameterize model.
65
+ [2026-01-12 17:31:54 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
66
+ [2026-01-12 17:31:56 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
67
+ [2026-01-12 17:32:19 root] (main.py 98): INFO Quantized model for Qwen3 saved at ./qmodel/Qwen3-8B.
68
+ [2026-01-12 17:32:31 root] (main.py 107): INFO wikitext2
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173513.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-12 17:35:13 root] (args_utils.py 169): INFO Arguments:
2
+ [2026-01-12 17:35:13 root] (args_utils.py 170): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': True,
42
+ 'save_matrix': True,
43
+ 'save_qmodel_path': './qmodel/Qwen3-8B',
44
+ 'seed': 0,
45
+ 'separate_vtrans': False,
46
+ 'tasks': ['piqa',
47
+ 'hellaswag',
48
+ 'arc_easy',
49
+ 'arc_challenge',
50
+ 'winogrande',
51
+ 'lambada_openai'],
52
+ 'v_asym': False,
53
+ 'v_bits': 16,
54
+ 'v_groupsize': -1,
55
+ 'w_asym': False,
56
+ 'w_bits': 4,
57
+ 'w_groupsize': 128,
58
+ 'warmup': False}
59
+ [2026-01-12 17:35:13 root] (args_utils.py 171): INFO ------------------------------------------------------------
60
+ [2026-01-12 17:35:14 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
61
+ [2026-01-12 17:35:32 root] (main.py 25): INFO Finished loading training data.
62
+ [2026-01-12 17:35:38 root] (main.py 29): INFO Finished applying FlatQuant to model.
63
+ [2026-01-12 17:35:43 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
64
+ [2026-01-12 17:36:33 root] (main.py 39): INFO Finished reparameterize model.
65
+ [2026-01-12 17:37:07 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_173832.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-12 17:38:32 root] (args_utils.py 169): INFO Arguments:
2
+ [2026-01-12 17:38:32 root] (args_utils.py 170): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': True,
42
+ 'save_matrix': True,
43
+ 'save_qmodel_path': './qmodel/Qwen3-8B',
44
+ 'seed': 0,
45
+ 'separate_vtrans': False,
46
+ 'tasks': ['piqa',
47
+ 'hellaswag',
48
+ 'arc_easy',
49
+ 'arc_challenge',
50
+ 'winogrande',
51
+ 'lambada_openai'],
52
+ 'v_asym': False,
53
+ 'v_bits': 16,
54
+ 'v_groupsize': -1,
55
+ 'w_asym': False,
56
+ 'w_bits': 4,
57
+ 'w_groupsize': 128,
58
+ 'warmup': False}
59
+ [2026-01-12 17:38:32 root] (args_utils.py 171): INFO ------------------------------------------------------------
60
+ [2026-01-12 17:38:32 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
61
+ [2026-01-12 17:38:51 root] (main.py 25): INFO Finished loading training data.
62
+ [2026-01-12 17:38:59 root] (main.py 29): INFO Finished applying FlatQuant to model.
63
+ [2026-01-12 17:39:02 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
64
+ [2026-01-12 17:39:44 root] (main.py 39): INFO Finished reparameterize model.
65
+ [2026-01-12 17:40:16 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
outputs/Qwen3-8B/w4a4/exp/log_rank0_20260112_181953.txt ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-12 18:19:53 root] (args_utils.py 169): INFO Arguments:
2
+ [2026-01-12 18:19:53 root] (args_utils.py 170): INFO {'a_asym': False,
3
+ 'a_bits': 4,
4
+ 'a_groupsize': 128,
5
+ 'act_order': False,
6
+ 'add_diag': True,
7
+ 'cali_bsz': 4,
8
+ 'cali_dataset': 'wikitext2',
9
+ 'cali_trans': True,
10
+ 'deactive_amp': False,
11
+ 'diag_alpha': 0.3,
12
+ 'diag_init': 'sq_style',
13
+ 'direct_inv': False,
14
+ 'distribute_model': False,
15
+ 'epochs': 15,
16
+ 'exp_dir': './outputs/Qwen3-8B/w4a4/exp',
17
+ 'exp_name': 'exp',
18
+ 'flat_lr': 0.005,
19
+ 'gptq': False,
20
+ 'gptq_mse': False,
21
+ 'hf_token': None,
22
+ 'k_asym': False,
23
+ 'k_bits': 16,
24
+ 'k_groupsize': -1,
25
+ 'lac': True,
26
+ 'lm_eval': False,
27
+ 'lm_eval_batch_size': 128,
28
+ 'lwc': True,
29
+ 'matrix_path': None,
30
+ 'model': 'Qwen/Qwen3-8B',
31
+ 'model_name': 'Qwen3-8B',
32
+ 'nsamples': 128,
33
+ 'output_dir': './outputs',
34
+ 'percdamp': 0.01,
35
+ 'q_asym': False,
36
+ 'q_bits': 16,
37
+ 'q_groupsize': -1,
38
+ 'quantize': True,
39
+ 'quantized_save': False,
40
+ 'reload_matrix': False,
41
+ 'resume': True,
42
+ 'save_matrix': True,
43
+ 'save_qmodel_path': './qmodel/Qwen3-8B',
44
+ 'seed': 0,
45
+ 'separate_vtrans': False,
46
+ 'tasks': ['piqa',
47
+ 'hellaswag',
48
+ 'arc_easy',
49
+ 'arc_challenge',
50
+ 'winogrande',
51
+ 'lambada_openai'],
52
+ 'v_asym': False,
53
+ 'v_bits': 16,
54
+ 'v_groupsize': -1,
55
+ 'w_asym': False,
56
+ 'w_bits': 4,
57
+ 'w_groupsize': 128,
58
+ 'warmup': False}
59
+ [2026-01-12 18:19:53 root] (args_utils.py 171): INFO ------------------------------------------------------------
60
+ [2026-01-12 18:19:54 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
61
+ [2026-01-12 18:20:13 root] (main.py 25): INFO Finished loading training data.
62
+ [2026-01-12 18:20:20 root] (main.py 29): INFO Finished applying FlatQuant to model.
63
+ [2026-01-12 18:20:24 root] (flat_utils.py 80): INFO saved paramaters at ./outputs/Qwen3-8B/w4a4/exp/flat_matrices.pth
64
+ [2026-01-12 18:21:14 root] (main.py 39): INFO Finished reparameterize model.
65
+ [2026-01-12 18:21:59 root] (utils.py 48): INFO GPU memory (from rtn_fwrd): 0.10 -> 0.10 GB (0.00 GB)
66
+ [2026-01-12 18:22:02 root] (model_utils.py 83): INFO ---> Loading Qwen/Qwen3-8B Model with seq_len: 2048
67
+ [2026-01-12 18:22:27 root] (main.py 104): INFO Quantized model for Qwen3 saved at ./qmodel/Qwen3-8B.
68
+ [2026-01-12 18:22:44 root] (main.py 113): INFO wikitext2
outputs/Qwen3-8B/w4a4/exp/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31756e3da4017b19338662bd1e6ea8d157c287c4d303910f92c897188a79399a
3
+ size 4734049352
outputs/Qwen3-8B/w4a4/exp/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf1344bde98672393a75f1eac7d77cd94eb9324696c83fb9d771278c7bbf9b52
3
+ size 1461808272
outputs/Qwen3-8B/w4a4/exp/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
outputs/Qwen3-8B/w4a4/exp/quantization_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "w_bits": 4,
3
+ "model_name": "Qwen/Qwen3-8B",
4
+ "symmetric": true,
5
+ "format": "packed_int4",
6
+ "sharded": true
7
+ }
outputs/Qwen3-8B/w4a4/exp/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
outputs/Qwen3-8B/w4a4/exp/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }