elichen-skymizer commited on
Commit
84c5656
·
1 Parent(s): 68feba8

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T20-10-17.511223.json +133 -0
  2. results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T20-44-05.253280.json +132 -0
  3. results/gemma-3-1b-pt-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T20-05-03.343796.json +0 -0
  4. results/gemma-3-1b-pt-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T20-46-37.211472.json +130 -0
  5. results/gemma-3-1b-pt-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T21-10-30.485409.json +137 -0
  6. results/gemma-3-1b-pt-q3_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-23-37.420421.json +133 -0
  7. results/gemma-3-1b-pt-q3_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-57-37.974497.json +132 -0
  8. results/gemma-3-1b-pt-q3_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-18-15.608165.json +0 -0
  9. results/gemma-3-1b-pt-q3_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-00-16.738180.json +130 -0
  10. results/gemma-3-1b-pt-q3_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-24-50.975707.json +137 -0
  11. results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T18-44-50.062799.json +133 -0
  12. results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T19-20-45.909443.json +132 -0
  13. results/gemma-3-1b-pt-q4_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T18-39-32.962297.json +0 -0
  14. results/gemma-3-1b-pt-q4_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T19-23-19.234939.json +130 -0
  15. results/gemma-3-1b-pt-q4_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T19-47-21.865123.json +137 -0
  16. results/gemma-3-1b-pt-q4_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-00-04.167957.json +133 -0
  17. results/gemma-3-1b-pt-q4_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-34-47.776962.json +132 -0
  18. results/gemma-3-1b-pt-q4_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T09-53-47.483830.json +0 -0
  19. results/gemma-3-1b-pt-q4_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-37-24.805509.json +130 -0
  20. results/gemma-3-1b-pt-q4_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-01-05.515541.json +137 -0
  21. results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T12-48-45.405550.json +133 -0
  22. results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T13-22-33.247468.json +132 -0
  23. results/gemma-3-1b-pt-q5_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T12-43-27.880531.json +0 -0
  24. results/gemma-3-1b-pt-q5_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T13-25-06.984222.json +130 -0
  25. results/gemma-3-1b-pt-q5_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T13-48-42.483459.json +137 -0
  26. results/gemma-3-1b-pt-q5_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-06-27.477647.json +133 -0
  27. results/gemma-3-1b-pt-q5_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-40-40.091612.json +132 -0
  28. results/gemma-3-1b-pt-q5_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-00-57.878146.json +0 -0
  29. results/gemma-3-1b-pt-q5_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-43-18.778488.json +130 -0
  30. results/gemma-3-1b-pt-q5_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T05-08-03.555978.json +137 -0
  31. results/llama-3.1-8b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T04-51-36.078167.json +133 -0
  32. results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-28T23-45-08.122514.json +133 -0
  33. results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T03-38-59.589582.json +132 -0
  34. results/llama-3.1-8b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T13-45-33.534718.json +141 -0
  35. results/llama-3.1-8b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-28T23-14-12.467699.json +0 -0
  36. results/llama-3.1-8b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T03-45-45.219594.json +130 -0
  37. results/llama-3.1-8b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T04-41-01.642818.json +137 -0
  38. results/llama-3.1-8b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-19-24.887264.json +133 -0
  39. results/llama-3.1-8b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T11-17-57.196185.json +133 -0
  40. results/llama-3.1-8b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-11-18.604003.json +132 -0
  41. results/llama-3.1-8b-instruct-q3_k_m/ifeval/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-29T11-43-19.960215.json +141 -0
  42. results/llama-3.1-8b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T10-49-52.307915.json +0 -0
  43. results/llama-3.1-8b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-17-14.136330.json +130 -0
  44. results/llama-3.1-8b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-11-09.665476.json +137 -0
  45. results/llama-3.2-1b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T10-43-39.403807.json +133 -0
  46. results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-29T09-23-04.950976.json +133 -0
  47. results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T10-13-20.039729.json +132 -0
  48. results/llama-3.2-1b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T14-53-30.492986.json +141 -0
  49. results/llama-3.2-1b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-29T09-15-25.269759.json +0 -0
  50. results/llama-3.2-1b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T10-17-22.800022.json +130 -0
results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T20-10-17.511223.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.4598685520812587,
6
+ "acc_stderr,none": 0.004973683026201962,
7
+ "acc_norm,none": 0.6090420235012945,
8
+ "acc_norm_stderr,none": 0.004869677330801213
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 0
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q3_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 64,
86
+ 64,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.0",
102
+ "date": 1756238804.3530836,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<pad>",
109
+ "0"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<eos>",
113
+ "1"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<bos>",
117
+ "2"
118
+ ],
119
+ "eot_token_id": 1,
120
+ "max_length": 32768,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "./models/",
124
+ "model_name_sanitized": ".__models__",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": false,
128
+ "chat_template": null,
129
+ "chat_template_sha": null,
130
+ "start_time": 10370169.769943833,
131
+ "end_time": 10370440.947691692,
132
+ "total_evaluation_time_seconds": "271.17774785868824"
133
+ }
results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T20-44-05.253280.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.45439155546703847,
6
+ "acc_stderr,none": 0.004968979259737878,
7
+ "acc_norm,none": 0.6121290579565823,
8
+ "acc_norm_stderr,none": 0.00486269059481592
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 10
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q3_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 17,
86
+ 17,
87
+ 19,
88
+ 19
89
+ ],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "v0.1.0",
101
+ "date": 1756239118.122381,
102
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
103
+ "transformers_version": "4.55.4",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<pad>",
108
+ "0"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<eos>",
112
+ "1"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<bos>",
116
+ "2"
117
+ ],
118
+ "eot_token_id": 1,
119
+ "max_length": 32768,
120
+ "task_hashes": {},
121
+ "model_source": "hf",
122
+ "model_name": "./models/",
123
+ "model_name_sanitized": ".__models__",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "fewshot_as_multiturn": false,
127
+ "chat_template": null,
128
+ "chat_template_sha": null,
129
+ "start_time": 10370483.85469766,
130
+ "end_time": 10372468.684128964,
131
+ "total_evaluation_time_seconds": "1984.8294313047081"
132
+ }
results/gemma-3-1b-pt-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T20-05-03.343796.json ADDED
The diff for this file is too large to render. See raw diff
 
results/gemma-3-1b-pt-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T20-46-37.211472.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.7372143634385201,
6
+ "acc_stderr,none": 0.01026935406814087,
7
+ "acc_norm,none": 0.7415669205658324,
8
+ "acc_norm_stderr,none": 0.010213971636773348
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "baber/piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0,
49
+ "pretrained": "./models/",
50
+ "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf",
51
+ "tokenizer": "google/gemma-3-1b-pt"
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "piqa": 1.0
57
+ },
58
+ "n-shot": {
59
+ "piqa": 0
60
+ },
61
+ "higher_is_better": {
62
+ "piqa": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "piqa": {
69
+ "original": 1838,
70
+ "effective": 1838
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q3_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
76
+ "model_num_parameters": 999885952,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "",
80
+ "batch_size": "auto:4",
81
+ "batch_sizes": [
82
+ 64,
83
+ 64,
84
+ 64,
85
+ 64,
86
+ 64
87
+ ],
88
+ "device": null,
89
+ "use_cache": null,
90
+ "limit": null,
91
+ "bootstrap_iters": 100000,
92
+ "gen_kwargs": null,
93
+ "random_seed": 0,
94
+ "numpy_seed": 1234,
95
+ "torch_seed": 1234,
96
+ "fewshot_seed": 1234
97
+ },
98
+ "git_hash": "v0.1.0",
99
+ "date": 1756241146.1941133,
100
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
101
+ "transformers_version": "4.55.4",
102
+ "lm_eval_version": "0.4.8",
103
+ "upper_git_hash": null,
104
+ "tokenizer_pad_token": [
105
+ "<pad>",
106
+ "0"
107
+ ],
108
+ "tokenizer_eos_token": [
109
+ "<eos>",
110
+ "1"
111
+ ],
112
+ "tokenizer_bos_token": [
113
+ "<bos>",
114
+ "2"
115
+ ],
116
+ "eot_token_id": 1,
117
+ "max_length": 32768,
118
+ "task_hashes": {},
119
+ "model_source": "hf",
120
+ "model_name": "./models/",
121
+ "model_name_sanitized": ".__models__",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": false,
125
+ "chat_template": null,
126
+ "chat_template_sha": null,
127
+ "start_time": 10372511.903404668,
128
+ "end_time": 10372620.646790544,
129
+ "total_evaluation_time_seconds": "108.7433858755976"
130
+ }
results/gemma-3-1b-pt-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T21-10-30.485409.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "triviaqa": {
4
+ "alias": "triviaqa",
5
+ "exact_match,remove_whitespace": 0.3350423539901917,
6
+ "exact_match_stderr,remove_whitespace": 0.0035237031863525254
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "triviaqa": []
11
+ },
12
+ "configs": {
13
+ "triviaqa": {
14
+ "task": "triviaqa",
15
+ "dataset_path": "trivia_qa",
16
+ "dataset_name": "rc.nocontext",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "Question: {{question}}?\nAnswer:",
20
+ "doc_to_target": "{{answer.aliases}}",
21
+ "unsafe_code": false,
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true,
31
+ "ignore_case": true,
32
+ "ignore_punctuation": true
33
+ }
34
+ ],
35
+ "output_type": "generate_until",
36
+ "generation_kwargs": {
37
+ "until": [
38
+ "\n",
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0
44
+ },
45
+ "repeats": 1,
46
+ "filter_list": [
47
+ {
48
+ "name": "remove_whitespace",
49
+ "filter": [
50
+ {
51
+ "function": "remove_whitespace"
52
+ },
53
+ {
54
+ "function": "take_first"
55
+ }
56
+ ]
57
+ }
58
+ ],
59
+ "should_decontaminate": true,
60
+ "doc_to_decontamination_query": "question",
61
+ "metadata": {
62
+ "version": 3.0,
63
+ "pretrained": "./models/",
64
+ "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf",
65
+ "tokenizer": "google/gemma-3-1b-pt"
66
+ }
67
+ }
68
+ },
69
+ "versions": {
70
+ "triviaqa": 3.0
71
+ },
72
+ "n-shot": {
73
+ "triviaqa": 5
74
+ },
75
+ "higher_is_better": {
76
+ "triviaqa": {
77
+ "exact_match": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "triviaqa": {
82
+ "original": 17944,
83
+ "effective": 17944
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q3_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
89
+ "model_num_parameters": 999885952,
90
+ "model_dtype": "torch.float32",
91
+ "model_revision": "main",
92
+ "model_sha": "",
93
+ "batch_size": "auto:4",
94
+ "batch_sizes": [],
95
+ "device": null,
96
+ "use_cache": null,
97
+ "limit": null,
98
+ "bootstrap_iters": 100000,
99
+ "gen_kwargs": null,
100
+ "random_seed": 0,
101
+ "numpy_seed": 1234,
102
+ "torch_seed": 1234,
103
+ "fewshot_seed": 1234
104
+ },
105
+ "git_hash": "v0.1.0",
106
+ "date": 1756241298.2547183,
107
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
108
+ "transformers_version": "4.55.4",
109
+ "lm_eval_version": "0.4.8",
110
+ "upper_git_hash": null,
111
+ "tokenizer_pad_token": [
112
+ "<pad>",
113
+ "0"
114
+ ],
115
+ "tokenizer_eos_token": [
116
+ "<eos>",
117
+ "1"
118
+ ],
119
+ "tokenizer_bos_token": [
120
+ "<bos>",
121
+ "2"
122
+ ],
123
+ "eot_token_id": 1,
124
+ "max_length": 32768,
125
+ "task_hashes": {},
126
+ "model_source": "hf",
127
+ "model_name": "./models/",
128
+ "model_name_sanitized": ".__models__",
129
+ "system_instruction": null,
130
+ "system_instruction_sha": null,
131
+ "fewshot_as_multiturn": false,
132
+ "chat_template": null,
133
+ "chat_template_sha": null,
134
+ "start_time": 10372663.582597185,
135
+ "end_time": 10374053.921114726,
136
+ "total_evaluation_time_seconds": "1390.3385175410658"
137
+ }
results/gemma-3-1b-pt-q3_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-23-37.420421.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.4614618601872137,
6
+ "acc_stderr,none": 0.004974937803907778,
7
+ "acc_norm,none": 0.608743278231428,
8
+ "acc_norm_stderr,none": 0.004870342592914952
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
53
+ "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 0
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q3_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 64,
86
+ 64,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.0",
102
+ "date": 1756207197.8950393,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<pad>",
109
+ "0"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<eos>",
113
+ "1"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<bos>",
117
+ "2"
118
+ ],
119
+ "eot_token_id": 1,
120
+ "max_length": 32768,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
124
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": false,
128
+ "chat_template": null,
129
+ "chat_template_sha": null,
130
+ "start_time": 10338562.802890183,
131
+ "end_time": 10338840.856532628,
132
+ "total_evaluation_time_seconds": "278.0536424443126"
133
+ }
results/gemma-3-1b-pt-q3_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-57-37.974497.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.45737900816570404,
6
+ "acc_stderr,none": 0.004971619995880016,
7
+ "acc_norm,none": 0.6138219478191596,
8
+ "acc_norm_stderr,none": 0.0048587719634691625
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
53
+ "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 10
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q3_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 17,
86
+ 17,
87
+ 19,
88
+ 19
89
+ ],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "v0.1.0",
101
+ "date": 1756207518.121341,
102
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
103
+ "transformers_version": "4.55.4",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<pad>",
108
+ "0"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<eos>",
112
+ "1"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<bos>",
116
+ "2"
117
+ ],
118
+ "eot_token_id": 1,
119
+ "max_length": 32768,
120
+ "task_hashes": {},
121
+ "model_source": "hf",
122
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
123
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "fewshot_as_multiturn": false,
127
+ "chat_template": null,
128
+ "chat_template_sha": null,
129
+ "start_time": 10338883.798745643,
130
+ "end_time": 10340881.405883452,
131
+ "total_evaluation_time_seconds": "1997.6071378085762"
132
+ }
results/gemma-3-1b-pt-q3_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-18-15.608165.json ADDED
The diff for this file is too large to render. See raw diff
 
results/gemma-3-1b-pt-q3_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-00-16.738180.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.7372143634385201,
6
+ "acc_stderr,none": 0.01026935406814087,
7
+ "acc_norm,none": 0.7415669205658324,
8
+ "acc_norm_stderr,none": 0.010213971636773348
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "baber/piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0,
49
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
50
+ "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf",
51
+ "tokenizer": "google/gemma-3-1b-pt"
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "piqa": 1.0
57
+ },
58
+ "n-shot": {
59
+ "piqa": 0
60
+ },
61
+ "higher_is_better": {
62
+ "piqa": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "piqa": {
69
+ "original": 1838,
70
+ "effective": 1838
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q3_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
76
+ "model_num_parameters": 999885952,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
80
+ "batch_size": "auto:4",
81
+ "batch_sizes": [
82
+ 64,
83
+ 64,
84
+ 64,
85
+ 64,
86
+ 64
87
+ ],
88
+ "device": null,
89
+ "use_cache": null,
90
+ "limit": null,
91
+ "bootstrap_iters": 100000,
92
+ "gen_kwargs": null,
93
+ "random_seed": 0,
94
+ "numpy_seed": 1234,
95
+ "torch_seed": 1234,
96
+ "fewshot_seed": 1234
97
+ },
98
+ "git_hash": "v0.1.0",
99
+ "date": 1756209560.3927722,
100
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
101
+ "transformers_version": "4.55.4",
102
+ "lm_eval_version": "0.4.8",
103
+ "upper_git_hash": null,
104
+ "tokenizer_pad_token": [
105
+ "<pad>",
106
+ "0"
107
+ ],
108
+ "tokenizer_eos_token": [
109
+ "<eos>",
110
+ "1"
111
+ ],
112
+ "tokenizer_bos_token": [
113
+ "<bos>",
114
+ "2"
115
+ ],
116
+ "eot_token_id": 1,
117
+ "max_length": 32768,
118
+ "task_hashes": {},
119
+ "model_source": "hf",
120
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
121
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": false,
125
+ "chat_template": null,
126
+ "chat_template_sha": null,
127
+ "start_time": 10340925.41148755,
128
+ "end_time": 10341040.176555607,
129
+ "total_evaluation_time_seconds": "114.76506805792451"
130
+ }
results/gemma-3-1b-pt-q3_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-24-50.975707.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "triviaqa": {
4
+ "alias": "triviaqa",
5
+ "exact_match,remove_whitespace": 0.3355439144003567,
6
+ "exact_match_stderr,remove_whitespace": 0.0035250095379466854
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "triviaqa": []
11
+ },
12
+ "configs": {
13
+ "triviaqa": {
14
+ "task": "triviaqa",
15
+ "dataset_path": "trivia_qa",
16
+ "dataset_name": "rc.nocontext",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "Question: {{question}}?\nAnswer:",
20
+ "doc_to_target": "{{answer.aliases}}",
21
+ "unsafe_code": false,
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true,
31
+ "ignore_case": true,
32
+ "ignore_punctuation": true
33
+ }
34
+ ],
35
+ "output_type": "generate_until",
36
+ "generation_kwargs": {
37
+ "until": [
38
+ "\n",
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0
44
+ },
45
+ "repeats": 1,
46
+ "filter_list": [
47
+ {
48
+ "name": "remove_whitespace",
49
+ "filter": [
50
+ {
51
+ "function": "remove_whitespace"
52
+ },
53
+ {
54
+ "function": "take_first"
55
+ }
56
+ ]
57
+ }
58
+ ],
59
+ "should_decontaminate": true,
60
+ "doc_to_decontamination_query": "question",
61
+ "metadata": {
62
+ "version": 3.0,
63
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
64
+ "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf",
65
+ "tokenizer": "google/gemma-3-1b-pt"
66
+ }
67
+ }
68
+ },
69
+ "versions": {
70
+ "triviaqa": 3.0
71
+ },
72
+ "n-shot": {
73
+ "triviaqa": 5
74
+ },
75
+ "higher_is_better": {
76
+ "triviaqa": {
77
+ "exact_match": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "triviaqa": {
82
+ "original": 17944,
83
+ "effective": 17944
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q3_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
89
+ "model_num_parameters": 999885952,
90
+ "model_dtype": "torch.float32",
91
+ "model_revision": "main",
92
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
93
+ "batch_size": "auto:4",
94
+ "batch_sizes": [],
95
+ "device": null,
96
+ "use_cache": null,
97
+ "limit": null,
98
+ "bootstrap_iters": 100000,
99
+ "gen_kwargs": null,
100
+ "random_seed": 0,
101
+ "numpy_seed": 1234,
102
+ "torch_seed": 1234,
103
+ "fewshot_seed": 1234
104
+ },
105
+ "git_hash": "v0.1.0",
106
+ "date": 1756209717.293979,
107
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
108
+ "transformers_version": "4.55.4",
109
+ "lm_eval_version": "0.4.8",
110
+ "upper_git_hash": null,
111
+ "tokenizer_pad_token": [
112
+ "<pad>",
113
+ "0"
114
+ ],
115
+ "tokenizer_eos_token": [
116
+ "<eos>",
117
+ "1"
118
+ ],
119
+ "tokenizer_bos_token": [
120
+ "<bos>",
121
+ "2"
122
+ ],
123
+ "eot_token_id": 1,
124
+ "max_length": 32768,
125
+ "task_hashes": {},
126
+ "model_source": "hf",
127
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
128
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
129
+ "system_instruction": null,
130
+ "system_instruction_sha": null,
131
+ "fewshot_as_multiturn": false,
132
+ "chat_template": null,
133
+ "chat_template_sha": null,
134
+ "start_time": 10341082.841870524,
135
+ "end_time": 10342514.413853284,
136
+ "total_evaluation_time_seconds": "1431.5719827599823"
137
+ }
results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T18-44-50.062799.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.4640509858593906,
6
+ "acc_stderr,none": 0.004976867796583177,
7
+ "acc_norm,none": 0.6145190201155148,
8
+ "acc_norm_stderr,none": 0.004857140410776821
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 0
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q4_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 64,
86
+ 64,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.0",
102
+ "date": 1756233676.0701602,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<pad>",
109
+ "0"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<eos>",
113
+ "1"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<bos>",
117
+ "2"
118
+ ],
119
+ "eot_token_id": 1,
120
+ "max_length": 32768,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "./models/",
124
+ "model_name_sanitized": ".__models__",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": false,
128
+ "chat_template": null,
129
+ "chat_template_sha": null,
130
+ "start_time": 10365040.74355738,
131
+ "end_time": 10365313.498939076,
132
+ "total_evaluation_time_seconds": "272.7553816959262"
133
+ }
results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T19-20-45.909443.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.46325433180641307,
6
+ "acc_stderr,none": 0.004976288321682394,
7
+ "acc_norm,none": 0.6208922525393348,
8
+ "acc_norm_stderr,none": 0.004841734453506477
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 10
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q4_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 17,
86
+ 17,
87
+ 19,
88
+ 19
89
+ ],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "v0.1.0",
101
+ "date": 1756234118.1483746,
102
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
103
+ "transformers_version": "4.55.4",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<pad>",
108
+ "0"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<eos>",
112
+ "1"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<bos>",
116
+ "2"
117
+ ],
118
+ "eot_token_id": 1,
119
+ "max_length": 32768,
120
+ "task_hashes": {},
121
+ "model_source": "hf",
122
+ "model_name": "./models/",
123
+ "model_name_sanitized": ".__models__",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "fewshot_as_multiturn": false,
127
+ "chat_template": null,
128
+ "chat_template_sha": null,
129
+ "start_time": 10365356.778950576,
130
+ "end_time": 10367469.338681001,
131
+ "total_evaluation_time_seconds": "2112.559730425477"
132
+ }
results/gemma-3-1b-pt-q4_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T18-39-32.962297.json ADDED
The diff for this file is too large to render. See raw diff
 
results/gemma-3-1b-pt-q4_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T19-23-19.234939.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.7421109902067464,
6
+ "acc_stderr,none": 0.010206956662056201,
7
+ "acc_norm,none": 0.7464635473340587,
8
+ "acc_norm_stderr,none": 0.010150090834551817
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "baber/piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0,
49
+ "pretrained": "./models/",
50
+ "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf",
51
+ "tokenizer": "google/gemma-3-1b-pt"
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "piqa": 1.0
57
+ },
58
+ "n-shot": {
59
+ "piqa": 0
60
+ },
61
+ "higher_is_better": {
62
+ "piqa": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "piqa": {
69
+ "original": 1838,
70
+ "effective": 1838
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q4_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
76
+ "model_num_parameters": 999885952,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "",
80
+ "batch_size": "auto:4",
81
+ "batch_sizes": [
82
+ 64,
83
+ 64,
84
+ 64,
85
+ 64,
86
+ 64
87
+ ],
88
+ "device": null,
89
+ "use_cache": null,
90
+ "limit": null,
91
+ "bootstrap_iters": 100000,
92
+ "gen_kwargs": null,
93
+ "random_seed": 0,
94
+ "numpy_seed": 1234,
95
+ "torch_seed": 1234,
96
+ "fewshot_seed": 1234
97
+ },
98
+ "git_hash": "v0.1.0",
99
+ "date": 1756236147.3807654,
100
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
101
+ "transformers_version": "4.55.4",
102
+ "lm_eval_version": "0.4.8",
103
+ "upper_git_hash": null,
104
+ "tokenizer_pad_token": [
105
+ "<pad>",
106
+ "0"
107
+ ],
108
+ "tokenizer_eos_token": [
109
+ "<eos>",
110
+ "1"
111
+ ],
112
+ "tokenizer_bos_token": [
113
+ "<bos>",
114
+ "2"
115
+ ],
116
+ "eot_token_id": 1,
117
+ "max_length": 32768,
118
+ "task_hashes": {},
119
+ "model_source": "hf",
120
+ "model_name": "./models/",
121
+ "model_name_sanitized": ".__models__",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": false,
125
+ "chat_template": null,
126
+ "chat_template_sha": null,
127
+ "start_time": 10367512.699529504,
128
+ "end_time": 10367622.673312971,
129
+ "total_evaluation_time_seconds": "109.97378346696496"
130
+ }
results/gemma-3-1b-pt-q4_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T19-47-21.865123.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "triviaqa": {
4
+ "alias": "triviaqa",
5
+ "exact_match,remove_whitespace": 0.34490637539010255,
6
+ "exact_match_stderr,remove_whitespace": 0.0035485813761982864
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "triviaqa": []
11
+ },
12
+ "configs": {
13
+ "triviaqa": {
14
+ "task": "triviaqa",
15
+ "dataset_path": "trivia_qa",
16
+ "dataset_name": "rc.nocontext",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "Question: {{question}}?\nAnswer:",
20
+ "doc_to_target": "{{answer.aliases}}",
21
+ "unsafe_code": false,
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true,
31
+ "ignore_case": true,
32
+ "ignore_punctuation": true
33
+ }
34
+ ],
35
+ "output_type": "generate_until",
36
+ "generation_kwargs": {
37
+ "until": [
38
+ "\n",
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0
44
+ },
45
+ "repeats": 1,
46
+ "filter_list": [
47
+ {
48
+ "name": "remove_whitespace",
49
+ "filter": [
50
+ {
51
+ "function": "remove_whitespace"
52
+ },
53
+ {
54
+ "function": "take_first"
55
+ }
56
+ ]
57
+ }
58
+ ],
59
+ "should_decontaminate": true,
60
+ "doc_to_decontamination_query": "question",
61
+ "metadata": {
62
+ "version": 3.0,
63
+ "pretrained": "./models/",
64
+ "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf",
65
+ "tokenizer": "google/gemma-3-1b-pt"
66
+ }
67
+ }
68
+ },
69
+ "versions": {
70
+ "triviaqa": 3.0
71
+ },
72
+ "n-shot": {
73
+ "triviaqa": 5
74
+ },
75
+ "higher_is_better": {
76
+ "triviaqa": {
77
+ "exact_match": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "triviaqa": {
82
+ "original": 17944,
83
+ "effective": 17944
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q4_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
89
+ "model_num_parameters": 999885952,
90
+ "model_dtype": "torch.float32",
91
+ "model_revision": "main",
92
+ "model_sha": "",
93
+ "batch_size": "auto:4",
94
+ "batch_sizes": [],
95
+ "device": null,
96
+ "use_cache": null,
97
+ "limit": null,
98
+ "bootstrap_iters": 100000,
99
+ "gen_kwargs": null,
100
+ "random_seed": 0,
101
+ "numpy_seed": 1234,
102
+ "torch_seed": 1234,
103
+ "fewshot_seed": 1234
104
+ },
105
+ "git_hash": "v0.1.0",
106
+ "date": 1756236300.3162215,
107
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
108
+ "transformers_version": "4.55.4",
109
+ "lm_eval_version": "0.4.8",
110
+ "upper_git_hash": null,
111
+ "tokenizer_pad_token": [
112
+ "<pad>",
113
+ "0"
114
+ ],
115
+ "tokenizer_eos_token": [
116
+ "<eos>",
117
+ "1"
118
+ ],
119
+ "tokenizer_bos_token": [
120
+ "<bos>",
121
+ "2"
122
+ ],
123
+ "eot_token_id": 1,
124
+ "max_length": 32768,
125
+ "task_hashes": {},
126
+ "model_source": "hf",
127
+ "model_name": "./models/",
128
+ "model_name_sanitized": ".__models__",
129
+ "system_instruction": null,
130
+ "system_instruction_sha": null,
131
+ "fewshot_as_multiturn": false,
132
+ "chat_template": null,
133
+ "chat_template_sha": null,
134
+ "start_time": 10367665.693355573,
135
+ "end_time": 10369065.303484928,
136
+ "total_evaluation_time_seconds": "1399.6101293545216"
137
+ }
results/gemma-3-1b-pt-q4_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-00-04.167957.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.46554471220872334,
6
+ "acc_stderr,none": 0.004977919906875265,
7
+ "acc_norm,none": 0.6160127464648476,
8
+ "acc_norm_stderr,none": 0.004853608805843713
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
53
+ "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 0
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q4_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 64,
86
+ 64,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.0",
102
+ "date": 1756202133.0889144,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<pad>",
109
+ "0"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<eos>",
113
+ "1"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<bos>",
117
+ "2"
118
+ ],
119
+ "eot_token_id": 1,
120
+ "max_length": 32768,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
124
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": false,
128
+ "chat_template": null,
129
+ "chat_template_sha": null,
130
+ "start_time": 10333496.188994976,
131
+ "end_time": 10333827.604686547,
132
+ "total_evaluation_time_seconds": "331.41569157131016"
133
+ }
results/gemma-3-1b-pt-q4_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-34-47.776962.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.46683927504481176,
6
+ "acc_stderr,none": 0.004978795454216555,
7
+ "acc_norm,none": 0.6236805417247561,
8
+ "acc_norm_stderr,none": 0.0048347158142077054
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
53
+ "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 10
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q4_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 15,
86
+ 19,
87
+ 19,
88
+ 19
89
+ ],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "v0.1.0",
101
+ "date": 1756202509.090127,
102
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
103
+ "transformers_version": "4.55.4",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<pad>",
108
+ "0"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<eos>",
112
+ "1"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<bos>",
116
+ "2"
117
+ ],
118
+ "eot_token_id": 1,
119
+ "max_length": 32768,
120
+ "task_hashes": {},
121
+ "model_source": "hf",
122
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
123
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "fewshot_as_multiturn": false,
127
+ "chat_template": null,
128
+ "chat_template_sha": null,
129
+ "start_time": 10333872.521076232,
130
+ "end_time": 10335911.204941303,
131
+ "total_evaluation_time_seconds": "2038.683865070343"
132
+ }
results/gemma-3-1b-pt-q4_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T09-53-47.483830.json ADDED
The diff for this file is too large to render. See raw diff
 
results/gemma-3-1b-pt-q4_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-37-24.805509.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.7404787812840044,
6
+ "acc_stderr,none": 0.010227939888174076,
7
+ "acc_norm,none": 0.7448313384113167,
8
+ "acc_norm_stderr,none": 0.010171571592521887
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "baber/piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0,
49
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
50
+ "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf",
51
+ "tokenizer": "google/gemma-3-1b-pt"
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "piqa": 1.0
57
+ },
58
+ "n-shot": {
59
+ "piqa": 0
60
+ },
61
+ "higher_is_better": {
62
+ "piqa": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "piqa": {
69
+ "original": 1838,
70
+ "effective": 1838
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q4_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
76
+ "model_num_parameters": 999885952,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
80
+ "batch_size": "auto:4",
81
+ "batch_sizes": [
82
+ 64,
83
+ 64,
84
+ 64,
85
+ 64,
86
+ 64
87
+ ],
88
+ "device": null,
89
+ "use_cache": null,
90
+ "limit": null,
91
+ "bootstrap_iters": 100000,
92
+ "gen_kwargs": null,
93
+ "random_seed": 0,
94
+ "numpy_seed": 1234,
95
+ "torch_seed": 1234,
96
+ "fewshot_seed": 1234
97
+ },
98
+ "git_hash": "v0.1.0",
99
+ "date": 1756204589.929225,
100
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
101
+ "transformers_version": "4.55.4",
102
+ "lm_eval_version": "0.4.8",
103
+ "upper_git_hash": null,
104
+ "tokenizer_pad_token": [
105
+ "<pad>",
106
+ "0"
107
+ ],
108
+ "tokenizer_eos_token": [
109
+ "<eos>",
110
+ "1"
111
+ ],
112
+ "tokenizer_bos_token": [
113
+ "<bos>",
114
+ "2"
115
+ ],
116
+ "eot_token_id": 1,
117
+ "max_length": 32768,
118
+ "task_hashes": {},
119
+ "model_source": "hf",
120
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
121
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": false,
125
+ "chat_template": null,
126
+ "chat_template_sha": null,
127
+ "start_time": 10335954.78639334,
128
+ "end_time": 10336068.2430083,
129
+ "total_evaluation_time_seconds": "113.45661495998502"
130
+ }
results/gemma-3-1b-pt-q4_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-01-05.515541.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "triviaqa": {
4
+ "alias": "triviaqa",
5
+ "exact_match,remove_whitespace": 0.3492532322781988,
6
+ "exact_match_stderr,remove_whitespace": 0.0035590058209197333
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "triviaqa": []
11
+ },
12
+ "configs": {
13
+ "triviaqa": {
14
+ "task": "triviaqa",
15
+ "dataset_path": "trivia_qa",
16
+ "dataset_name": "rc.nocontext",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "Question: {{question}}?\nAnswer:",
20
+ "doc_to_target": "{{answer.aliases}}",
21
+ "unsafe_code": false,
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true,
31
+ "ignore_case": true,
32
+ "ignore_punctuation": true
33
+ }
34
+ ],
35
+ "output_type": "generate_until",
36
+ "generation_kwargs": {
37
+ "until": [
38
+ "\n",
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0
44
+ },
45
+ "repeats": 1,
46
+ "filter_list": [
47
+ {
48
+ "name": "remove_whitespace",
49
+ "filter": [
50
+ {
51
+ "function": "remove_whitespace"
52
+ },
53
+ {
54
+ "function": "take_first"
55
+ }
56
+ ]
57
+ }
58
+ ],
59
+ "should_decontaminate": true,
60
+ "doc_to_decontamination_query": "question",
61
+ "metadata": {
62
+ "version": 3.0,
63
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
64
+ "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf",
65
+ "tokenizer": "google/gemma-3-1b-pt"
66
+ }
67
+ }
68
+ },
69
+ "versions": {
70
+ "triviaqa": 3.0
71
+ },
72
+ "n-shot": {
73
+ "triviaqa": 5
74
+ },
75
+ "higher_is_better": {
76
+ "triviaqa": {
77
+ "exact_match": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "triviaqa": {
82
+ "original": 17944,
83
+ "effective": 17944
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q4_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
89
+ "model_num_parameters": 999885952,
90
+ "model_dtype": "torch.float32",
91
+ "model_revision": "main",
92
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
93
+ "batch_size": "auto:4",
94
+ "batch_sizes": [],
95
+ "device": null,
96
+ "use_cache": null,
97
+ "limit": null,
98
+ "bootstrap_iters": 100000,
99
+ "gen_kwargs": null,
100
+ "random_seed": 0,
101
+ "numpy_seed": 1234,
102
+ "torch_seed": 1234,
103
+ "fewshot_seed": 1234
104
+ },
105
+ "git_hash": "v0.1.0",
106
+ "date": 1756204746.1531723,
107
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
108
+ "transformers_version": "4.55.4",
109
+ "lm_eval_version": "0.4.8",
110
+ "upper_git_hash": null,
111
+ "tokenizer_pad_token": [
112
+ "<pad>",
113
+ "0"
114
+ ],
115
+ "tokenizer_eos_token": [
116
+ "<eos>",
117
+ "1"
118
+ ],
119
+ "tokenizer_bos_token": [
120
+ "<bos>",
121
+ "2"
122
+ ],
123
+ "eot_token_id": 1,
124
+ "max_length": 32768,
125
+ "task_hashes": {},
126
+ "model_source": "hf",
127
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
128
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
129
+ "system_instruction": null,
130
+ "system_instruction_sha": null,
131
+ "fewshot_as_multiturn": false,
132
+ "chat_template": null,
133
+ "chat_template_sha": null,
134
+ "start_time": 10336111.31547239,
135
+ "end_time": 10337488.953307116,
136
+ "total_evaluation_time_seconds": "1377.6378347259015"
137
+ }
results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T12-48-45.405550.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.4690300736904999,
6
+ "acc_stderr,none": 0.004980200451851498,
7
+ "acc_norm,none": 0.6174068910575583,
8
+ "acc_norm_stderr,none": 0.004850268986903106
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 0
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q5_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 64,
86
+ 64,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.0",
102
+ "date": 1756212308.6486008,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<pad>",
109
+ "0"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<eos>",
113
+ "1"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<bos>",
117
+ "2"
118
+ ],
119
+ "eot_token_id": 1,
120
+ "max_length": 32768,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "./models/",
124
+ "model_name_sanitized": ".__models__",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": false,
128
+ "chat_template": null,
129
+ "chat_template_sha": null,
130
+ "start_time": 10343674.172472687,
131
+ "end_time": 10343948.842254344,
132
+ "total_evaluation_time_seconds": "274.6697816569358"
133
+ }
results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T13-22-33.247468.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.4682334196375224,
6
+ "acc_stderr,none": 0.004979700695747546,
7
+ "acc_norm,none": 0.622087233618801,
8
+ "acc_norm_stderr,none": 0.004838747305783286
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 10
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q5_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 17,
86
+ 17,
87
+ 19,
88
+ 19
89
+ ],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "v0.1.0",
101
+ "date": 1756212626.8379686,
102
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
103
+ "transformers_version": "4.55.4",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<pad>",
108
+ "0"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<eos>",
112
+ "1"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<bos>",
116
+ "2"
117
+ ],
118
+ "eot_token_id": 1,
119
+ "max_length": 32768,
120
+ "task_hashes": {},
121
+ "model_source": "hf",
122
+ "model_name": "./models/",
123
+ "model_name_sanitized": ".__models__",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "fewshot_as_multiturn": false,
127
+ "chat_template": null,
128
+ "chat_template_sha": null,
129
+ "start_time": 10343992.036112672,
130
+ "end_time": 10345976.675650535,
131
+ "total_evaluation_time_seconds": "1984.6395378634334"
132
+ }
results/gemma-3-1b-pt-q5_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T12-43-27.880531.json ADDED
The diff for this file is too large to render. See raw diff
 
results/gemma-3-1b-pt-q5_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T13-25-06.984222.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.7486398258977149,
6
+ "acc_stderr,none": 0.010121156016819219,
7
+ "acc_norm,none": 0.7464635473340587,
8
+ "acc_norm_stderr,none": 0.010150090834551817
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "baber/piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0,
49
+ "pretrained": "./models/",
50
+ "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf",
51
+ "tokenizer": "google/gemma-3-1b-pt"
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "piqa": 1.0
57
+ },
58
+ "n-shot": {
59
+ "piqa": 0
60
+ },
61
+ "higher_is_better": {
62
+ "piqa": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "piqa": {
69
+ "original": 1838,
70
+ "effective": 1838
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q5_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
76
+ "model_num_parameters": 999885952,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "",
80
+ "batch_size": "auto:4",
81
+ "batch_sizes": [
82
+ 64,
83
+ 64,
84
+ 64,
85
+ 64,
86
+ 64
87
+ ],
88
+ "device": null,
89
+ "use_cache": null,
90
+ "limit": null,
91
+ "bootstrap_iters": 100000,
92
+ "gen_kwargs": null,
93
+ "random_seed": 0,
94
+ "numpy_seed": 1234,
95
+ "torch_seed": 1234,
96
+ "fewshot_seed": 1234
97
+ },
98
+ "git_hash": "v0.1.0",
99
+ "date": 1756214655.268706,
100
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
101
+ "transformers_version": "4.55.4",
102
+ "lm_eval_version": "0.4.8",
103
+ "upper_git_hash": null,
104
+ "tokenizer_pad_token": [
105
+ "<pad>",
106
+ "0"
107
+ ],
108
+ "tokenizer_eos_token": [
109
+ "<eos>",
110
+ "1"
111
+ ],
112
+ "tokenizer_bos_token": [
113
+ "<bos>",
114
+ "2"
115
+ ],
116
+ "eot_token_id": 1,
117
+ "max_length": 32768,
118
+ "task_hashes": {},
119
+ "model_source": "hf",
120
+ "model_name": "./models/",
121
+ "model_name_sanitized": ".__models__",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": false,
125
+ "chat_template": null,
126
+ "chat_template_sha": null,
127
+ "start_time": 10346020.253014293,
128
+ "end_time": 10346130.420203028,
129
+ "total_evaluation_time_seconds": "110.16718873567879"
130
+ }
results/gemma-3-1b-pt-q5_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T13-48-42.483459.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "triviaqa": {
4
+ "alias": "triviaqa",
5
+ "exact_match,remove_whitespace": 0.3484172982612572,
6
+ "exact_match_stderr,remove_whitespace": 0.003557026484971732
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "triviaqa": []
11
+ },
12
+ "configs": {
13
+ "triviaqa": {
14
+ "task": "triviaqa",
15
+ "dataset_path": "trivia_qa",
16
+ "dataset_name": "rc.nocontext",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "Question: {{question}}?\nAnswer:",
20
+ "doc_to_target": "{{answer.aliases}}",
21
+ "unsafe_code": false,
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true,
31
+ "ignore_case": true,
32
+ "ignore_punctuation": true
33
+ }
34
+ ],
35
+ "output_type": "generate_until",
36
+ "generation_kwargs": {
37
+ "until": [
38
+ "\n",
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0
44
+ },
45
+ "repeats": 1,
46
+ "filter_list": [
47
+ {
48
+ "name": "remove_whitespace",
49
+ "filter": [
50
+ {
51
+ "function": "remove_whitespace"
52
+ },
53
+ {
54
+ "function": "take_first"
55
+ }
56
+ ]
57
+ }
58
+ ],
59
+ "should_decontaminate": true,
60
+ "doc_to_decontamination_query": "question",
61
+ "metadata": {
62
+ "version": 3.0,
63
+ "pretrained": "./models/",
64
+ "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf",
65
+ "tokenizer": "google/gemma-3-1b-pt"
66
+ }
67
+ }
68
+ },
69
+ "versions": {
70
+ "triviaqa": 3.0
71
+ },
72
+ "n-shot": {
73
+ "triviaqa": 5
74
+ },
75
+ "higher_is_better": {
76
+ "triviaqa": {
77
+ "exact_match": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "triviaqa": {
82
+ "original": 17944,
83
+ "effective": 17944
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q5_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt",
89
+ "model_num_parameters": 999885952,
90
+ "model_dtype": "torch.float32",
91
+ "model_revision": "main",
92
+ "model_sha": "",
93
+ "batch_size": "auto:4",
94
+ "batch_sizes": [],
95
+ "device": null,
96
+ "use_cache": null,
97
+ "limit": null,
98
+ "bootstrap_iters": 100000,
99
+ "gen_kwargs": null,
100
+ "random_seed": 0,
101
+ "numpy_seed": 1234,
102
+ "torch_seed": 1234,
103
+ "fewshot_seed": 1234
104
+ },
105
+ "git_hash": "v0.1.0",
106
+ "date": 1756214808.5410202,
107
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
108
+ "transformers_version": "4.55.4",
109
+ "lm_eval_version": "0.4.8",
110
+ "upper_git_hash": null,
111
+ "tokenizer_pad_token": [
112
+ "<pad>",
113
+ "0"
114
+ ],
115
+ "tokenizer_eos_token": [
116
+ "<eos>",
117
+ "1"
118
+ ],
119
+ "tokenizer_bos_token": [
120
+ "<bos>",
121
+ "2"
122
+ ],
123
+ "eot_token_id": 1,
124
+ "max_length": 32768,
125
+ "task_hashes": {},
126
+ "model_source": "hf",
127
+ "model_name": "./models/",
128
+ "model_name_sanitized": ".__models__",
129
+ "system_instruction": null,
130
+ "system_instruction_sha": null,
131
+ "fewshot_as_multiturn": false,
132
+ "chat_template": null,
133
+ "chat_template_sha": null,
134
+ "start_time": 10346173.579368023,
135
+ "end_time": 10347545.92164026,
136
+ "total_evaluation_time_seconds": "1372.3422722369432"
137
+ }
results/gemma-3-1b-pt-q5_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-06-27.477647.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.46873132842063336,
6
+ "acc_stderr,none": 0.004980014536540145,
7
+ "acc_norm,none": 0.6190997809201354,
8
+ "acc_norm_stderr,none": 0.004846156699486519
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
53
+ "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 0
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q5_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 64,
86
+ 64,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.0",
102
+ "date": 1756180961.7683156,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<pad>",
109
+ "0"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<eos>",
113
+ "1"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<bos>",
117
+ "2"
118
+ ],
119
+ "eot_token_id": 1,
120
+ "max_length": 32768,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
124
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": false,
128
+ "chat_template": null,
129
+ "chat_template_sha": null,
130
+ "start_time": 10312326.432171715,
131
+ "end_time": 10312610.914023504,
132
+ "total_evaluation_time_seconds": "284.4818517882377"
133
+ }
results/gemma-3-1b-pt-q5_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-40-40.091612.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.47032463652658835,
6
+ "acc_stderr,none": 0.004980985384152799,
7
+ "acc_norm,none": 0.6263692491535551,
8
+ "acc_norm_stderr,none": 0.004827786289074885
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
53
+ "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf",
54
+ "tokenizer": "google/gemma-3-1b-pt"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 10
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q5_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
79
+ "model_num_parameters": 999885952,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 17,
86
+ 17,
87
+ 19,
88
+ 19
89
+ ],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "v0.1.0",
101
+ "date": 1756181291.4446435,
102
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
103
+ "transformers_version": "4.55.4",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<pad>",
108
+ "0"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<eos>",
112
+ "1"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<bos>",
116
+ "2"
117
+ ],
118
+ "eot_token_id": 1,
119
+ "max_length": 32768,
120
+ "task_hashes": {},
121
+ "model_source": "hf",
122
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
123
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "fewshot_as_multiturn": false,
127
+ "chat_template": null,
128
+ "chat_template_sha": null,
129
+ "start_time": 10312655.172927069,
130
+ "end_time": 10314663.518843023,
131
+ "total_evaluation_time_seconds": "2008.34591595456"
132
+ }
results/gemma-3-1b-pt-q5_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-00-57.878146.json ADDED
The diff for this file is too large to render. See raw diff
 
results/gemma-3-1b-pt-q5_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-43-18.778488.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.749727965179543,
6
+ "acc_stderr,none": 0.01010656188008975,
7
+ "acc_norm,none": 0.7453754080522307,
8
+ "acc_norm_stderr,none": 0.010164432237060617
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "baber/piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0,
49
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
50
+ "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf",
51
+ "tokenizer": "google/gemma-3-1b-pt"
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "piqa": 1.0
57
+ },
58
+ "n-shot": {
59
+ "piqa": 0
60
+ },
61
+ "higher_is_better": {
62
+ "piqa": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "piqa": {
69
+ "original": 1838,
70
+ "effective": 1838
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q5_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
76
+ "model_num_parameters": 999885952,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
80
+ "batch_size": "auto:4",
81
+ "batch_sizes": [
82
+ 64,
83
+ 64,
84
+ 64,
85
+ 64,
86
+ 64
87
+ ],
88
+ "device": null,
89
+ "use_cache": null,
90
+ "limit": null,
91
+ "bootstrap_iters": 100000,
92
+ "gen_kwargs": null,
93
+ "random_seed": 0,
94
+ "numpy_seed": 1234,
95
+ "torch_seed": 1234,
96
+ "fewshot_seed": 1234
97
+ },
98
+ "git_hash": "v0.1.0",
99
+ "date": 1756183342.0299957,
100
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
101
+ "transformers_version": "4.55.4",
102
+ "lm_eval_version": "0.4.8",
103
+ "upper_git_hash": null,
104
+ "tokenizer_pad_token": [
105
+ "<pad>",
106
+ "0"
107
+ ],
108
+ "tokenizer_eos_token": [
109
+ "<eos>",
110
+ "1"
111
+ ],
112
+ "tokenizer_bos_token": [
113
+ "<bos>",
114
+ "2"
115
+ ],
116
+ "eot_token_id": 1,
117
+ "max_length": 32768,
118
+ "task_hashes": {},
119
+ "model_source": "hf",
120
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
121
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": false,
125
+ "chat_template": null,
126
+ "chat_template_sha": null,
127
+ "start_time": 10314707.323286947,
128
+ "end_time": 10314822.215488749,
129
+ "total_evaluation_time_seconds": "114.89220180176198"
130
+ }
results/gemma-3-1b-pt-q5_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T05-08-03.555978.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "triviaqa": {
4
+ "alias": "triviaqa",
5
+ "exact_match,remove_whitespace": 0.35298707088720466,
6
+ "exact_match_stderr,remove_whitespace": 0.003567700179654136
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "triviaqa": []
11
+ },
12
+ "configs": {
13
+ "triviaqa": {
14
+ "task": "triviaqa",
15
+ "dataset_path": "trivia_qa",
16
+ "dataset_name": "rc.nocontext",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "Question: {{question}}?\nAnswer:",
20
+ "doc_to_target": "{{answer.aliases}}",
21
+ "unsafe_code": false,
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true,
31
+ "ignore_case": true,
32
+ "ignore_punctuation": true
33
+ }
34
+ ],
35
+ "output_type": "generate_until",
36
+ "generation_kwargs": {
37
+ "until": [
38
+ "\n",
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0
44
+ },
45
+ "repeats": 1,
46
+ "filter_list": [
47
+ {
48
+ "name": "remove_whitespace",
49
+ "filter": [
50
+ {
51
+ "function": "remove_whitespace"
52
+ },
53
+ {
54
+ "function": "take_first"
55
+ }
56
+ ]
57
+ }
58
+ ],
59
+ "should_decontaminate": true,
60
+ "doc_to_decontamination_query": "question",
61
+ "metadata": {
62
+ "version": 3.0,
63
+ "pretrained": "skymizer/gemma-3-1b-pt-GGUF",
64
+ "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf",
65
+ "tokenizer": "google/gemma-3-1b-pt"
66
+ }
67
+ }
68
+ },
69
+ "versions": {
70
+ "triviaqa": 3.0
71
+ },
72
+ "n-shot": {
73
+ "triviaqa": 5
74
+ },
75
+ "higher_is_better": {
76
+ "triviaqa": {
77
+ "exact_match": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "triviaqa": {
82
+ "original": 17944,
83
+ "effective": 17944
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q5_k_m.gguf,tokenizer=google/gemma-3-1b-pt",
89
+ "model_num_parameters": 999885952,
90
+ "model_dtype": "torch.float32",
91
+ "model_revision": "main",
92
+ "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670",
93
+ "batch_size": "auto:4",
94
+ "batch_sizes": [],
95
+ "device": null,
96
+ "use_cache": null,
97
+ "limit": null,
98
+ "bootstrap_iters": 100000,
99
+ "gen_kwargs": null,
100
+ "random_seed": 0,
101
+ "numpy_seed": 1234,
102
+ "torch_seed": 1234,
103
+ "fewshot_seed": 1234
104
+ },
105
+ "git_hash": "v0.1.0",
106
+ "date": 1756183500.2287133,
107
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
108
+ "transformers_version": "4.55.4",
109
+ "lm_eval_version": "0.4.8",
110
+ "upper_git_hash": null,
111
+ "tokenizer_pad_token": [
112
+ "<pad>",
113
+ "0"
114
+ ],
115
+ "tokenizer_eos_token": [
116
+ "<eos>",
117
+ "1"
118
+ ],
119
+ "tokenizer_bos_token": [
120
+ "<bos>",
121
+ "2"
122
+ ],
123
+ "eot_token_id": 1,
124
+ "max_length": 32768,
125
+ "task_hashes": {},
126
+ "model_source": "hf",
127
+ "model_name": "skymizer/gemma-3-1b-pt-GGUF",
128
+ "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF",
129
+ "system_instruction": null,
130
+ "system_instruction_sha": null,
131
+ "fewshot_as_multiturn": false,
132
+ "chat_template": null,
133
+ "chat_template_sha": null,
134
+ "start_time": 10314865.389951872,
135
+ "end_time": 10316306.994069807,
136
+ "total_evaluation_time_seconds": "1441.6041179355234"
137
+ }
results/llama-3.1-8b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T04-51-36.078167.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "gpqa_main_zeroshot": {
4
+ "alias": "gpqa_main_zeroshot",
5
+ "acc,none": 0.27232142857142855,
6
+ "acc_stderr,none": 0.02105508212932411,
7
+ "acc_norm,none": 0.27232142857142855,
8
+ "acc_norm_stderr,none": 0.02105508212932411
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "gpqa_main_zeroshot": []
13
+ },
14
+ "configs": {
15
+ "gpqa_main_zeroshot": {
16
+ "task": "gpqa_main_zeroshot",
17
+ "tag": "gpqa",
18
+ "dataset_path": "Idavidrein/gpqa",
19
+ "dataset_name": "gpqa_main",
20
+ "training_split": "train",
21
+ "validation_split": "train",
22
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
23
+ "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:",
24
+ "doc_to_target": "answer",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": [
27
+ "(A)",
28
+ "(B)",
29
+ "(C)",
30
+ "(D)"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 0,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ }
47
+ ],
48
+ "output_type": "multiple_choice",
49
+ "repeats": 1,
50
+ "should_decontaminate": false,
51
+ "metadata": {
52
+ "version": 1.0,
53
+ "pretrained": "./models/",
54
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf",
55
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
56
+ }
57
+ }
58
+ },
59
+ "versions": {
60
+ "gpqa_main_zeroshot": 1.0
61
+ },
62
+ "n-shot": {
63
+ "gpqa_main_zeroshot": 0
64
+ },
65
+ "higher_is_better": {
66
+ "gpqa_main_zeroshot": {
67
+ "acc": true,
68
+ "acc_norm": true
69
+ }
70
+ },
71
+ "n-samples": {
72
+ "gpqa_main_zeroshot": {
73
+ "original": 448,
74
+ "effective": 448
75
+ }
76
+ },
77
+ "config": {
78
+ "model": "hf",
79
+ "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
80
+ "model_num_parameters": 8030261248,
81
+ "model_dtype": "torch.float32",
82
+ "model_revision": "main",
83
+ "model_sha": "",
84
+ "batch_size": "auto:4",
85
+ "batch_sizes": [
86
+ 9,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.1",
102
+ "date": 1756442725.230191,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<|eot_id|>",
109
+ "128009"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<|eot_id|>",
113
+ "128009"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<|begin_of_text|>",
117
+ "128000"
118
+ ],
119
+ "eot_token_id": 128009,
120
+ "max_length": 131072,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "./models/",
124
+ "model_name_sanitized": ".__models__",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": false,
128
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
129
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
130
+ "start_time": 6788353.722110151,
131
+ "end_time": 6788886.198874184,
132
+ "total_evaluation_time_seconds": "532.4767640326172"
133
+ }
results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-28T23-45-08.122514.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.5750846444931289,
6
+ "acc_stderr,none": 0.00493319877670009,
7
+ "acc_norm,none": 0.734017128062139,
8
+ "acc_norm_stderr,none": 0.004409521343139737
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf",
54
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 0
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
79
+ "model_num_parameters": 8030261248,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 64,
86
+ 64,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.1",
102
+ "date": 1756423105.123428,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<|eot_id|>",
109
+ "128009"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<|eot_id|>",
113
+ "128009"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<|begin_of_text|>",
117
+ "128000"
118
+ ],
119
+ "eot_token_id": 128009,
120
+ "max_length": 131072,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "./models/",
124
+ "model_name_sanitized": ".__models__",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": true,
128
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
129
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
130
+ "start_time": 6768754.425889828,
131
+ "end_time": 6770498.242228656,
132
+ "total_evaluation_time_seconds": "1743.8163388278335"
133
+ }
results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T03-38-59.589582.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.5942043417645887,
6
+ "acc_stderr,none": 0.004900417982582057,
7
+ "acc_norm,none": 0.7797251543517227,
8
+ "acc_norm_stderr,none": 0.004135849642817268
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf",
54
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 10
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
79
+ "model_num_parameters": 8030261248,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 19,
86
+ 19,
87
+ 22,
88
+ 22
89
+ ],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "v0.1.1",
101
+ "date": 1756424948.985949,
102
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
103
+ "transformers_version": "4.55.4",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<|eot_id|>",
108
+ "128009"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<|eot_id|>",
112
+ "128009"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<|begin_of_text|>",
116
+ "128000"
117
+ ],
118
+ "eot_token_id": 128009,
119
+ "max_length": 131072,
120
+ "task_hashes": {},
121
+ "model_source": "hf",
122
+ "model_name": "./models/",
123
+ "model_name_sanitized": ".__models__",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "fewshot_as_multiturn": true,
127
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
128
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
129
+ "start_time": 6770604.923830719,
130
+ "end_time": 6784529.707175097,
131
+ "total_evaluation_time_seconds": "13924.783344378695"
132
+ }
results/llama-3.1-8b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T13-45-33.534718.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "ifeval": {
4
+ "alias": "ifeval",
5
+ "prompt_level_strict_acc,none": 0.6987060998151571,
6
+ "prompt_level_strict_acc_stderr,none": 0.019744473483514356,
7
+ "inst_level_strict_acc,none": 0.7817745803357314,
8
+ "inst_level_strict_acc_stderr,none": "N/A",
9
+ "prompt_level_loose_acc,none": 0.7412199630314233,
10
+ "prompt_level_loose_acc_stderr,none": 0.018846992560712525,
11
+ "inst_level_loose_acc,none": 0.8141486810551559,
12
+ "inst_level_loose_acc_stderr,none": "N/A"
13
+ }
14
+ },
15
+ "group_subtasks": {
16
+ "ifeval": []
17
+ },
18
+ "configs": {
19
+ "ifeval": {
20
+ "task": "ifeval",
21
+ "dataset_path": "google/IFEval",
22
+ "test_split": "train",
23
+ "doc_to_text": "prompt",
24
+ "doc_to_target": 0,
25
+ "unsafe_code": false,
26
+ "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "prompt_level_strict_acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "inst_level_strict_acc",
39
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "prompt_level_loose_acc",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ },
47
+ {
48
+ "metric": "inst_level_loose_acc",
49
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
50
+ "higher_is_better": true
51
+ }
52
+ ],
53
+ "output_type": "generate_until",
54
+ "generation_kwargs": {
55
+ "until": [],
56
+ "do_sample": false,
57
+ "temperature": 0.0,
58
+ "max_gen_toks": 1280
59
+ },
60
+ "repeats": 1,
61
+ "should_decontaminate": false,
62
+ "metadata": {
63
+ "version": 4.0,
64
+ "pretrained": "./models/",
65
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf",
66
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
67
+ }
68
+ }
69
+ },
70
+ "versions": {
71
+ "ifeval": 4.0
72
+ },
73
+ "n-shot": {
74
+ "ifeval": 0
75
+ },
76
+ "higher_is_better": {
77
+ "ifeval": {
78
+ "prompt_level_strict_acc": true,
79
+ "inst_level_strict_acc": true,
80
+ "prompt_level_loose_acc": true,
81
+ "inst_level_loose_acc": true
82
+ }
83
+ },
84
+ "n-samples": {
85
+ "ifeval": {
86
+ "original": 541,
87
+ "effective": 541
88
+ }
89
+ },
90
+ "config": {
91
+ "model": "hf",
92
+ "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
93
+ "model_num_parameters": 8030261248,
94
+ "model_dtype": "torch.float32",
95
+ "model_revision": "main",
96
+ "model_sha": "",
97
+ "batch_size": "auto:4",
98
+ "batch_sizes": [],
99
+ "device": null,
100
+ "use_cache": null,
101
+ "limit": null,
102
+ "bootstrap_iters": 100000,
103
+ "gen_kwargs": null,
104
+ "random_seed": 0,
105
+ "numpy_seed": 1234,
106
+ "torch_seed": 1234,
107
+ "fewshot_seed": 1234
108
+ },
109
+ "git_hash": "v0.1.1",
110
+ "date": 1756471974.2640414,
111
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
112
+ "transformers_version": "4.55.4",
113
+ "lm_eval_version": "0.4.8",
114
+ "upper_git_hash": null,
115
+ "tokenizer_pad_token": [
116
+ "<|eot_id|>",
117
+ "128009"
118
+ ],
119
+ "tokenizer_eos_token": [
120
+ "<|eot_id|>",
121
+ "128009"
122
+ ],
123
+ "tokenizer_bos_token": [
124
+ "<|begin_of_text|>",
125
+ "128000"
126
+ ],
127
+ "eot_token_id": 128009,
128
+ "max_length": 131072,
129
+ "task_hashes": {},
130
+ "model_source": "hf",
131
+ "model_name": "./models/",
132
+ "model_name_sanitized": ".__models__",
133
+ "system_instruction": null,
134
+ "system_instruction_sha": null,
135
+ "fewshot_as_multiturn": false,
136
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
137
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
138
+ "start_time": 6817664.029671304,
139
+ "end_time": 6820923.655691498,
140
+ "total_evaluation_time_seconds": "3259.6260201940313"
141
+ }
results/llama-3.1-8b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-28T23-14-12.467699.json ADDED
The diff for this file is too large to render. See raw diff
 
results/llama-3.1-8b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T03-45-45.219594.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.795429815016322,
6
+ "acc_stderr,none": 0.009411688039193577,
7
+ "acc_norm,none": 0.794885745375408,
8
+ "acc_norm_stderr,none": 0.009420971671018023
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "baber/piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0,
49
+ "pretrained": "./models/",
50
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf",
51
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "piqa": 1.0
57
+ },
58
+ "n-shot": {
59
+ "piqa": 0
60
+ },
61
+ "higher_is_better": {
62
+ "piqa": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "piqa": {
69
+ "original": 1838,
70
+ "effective": 1838
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
76
+ "model_num_parameters": 8030261248,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "",
80
+ "batch_size": "auto:4",
81
+ "batch_sizes": [
82
+ 64,
83
+ 64,
84
+ 64,
85
+ 64,
86
+ 64
87
+ ],
88
+ "device": null,
89
+ "use_cache": null,
90
+ "limit": null,
91
+ "bootstrap_iters": 100000,
92
+ "gen_kwargs": null,
93
+ "random_seed": 0,
94
+ "numpy_seed": 1234,
95
+ "torch_seed": 1234,
96
+ "fewshot_seed": 1234
97
+ },
98
+ "git_hash": "v0.1.1",
99
+ "date": 1756438945.0546112,
100
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
101
+ "transformers_version": "4.55.4",
102
+ "lm_eval_version": "0.4.8",
103
+ "upper_git_hash": null,
104
+ "tokenizer_pad_token": [
105
+ "<|eot_id|>",
106
+ "128009"
107
+ ],
108
+ "tokenizer_eos_token": [
109
+ "<|eot_id|>",
110
+ "128009"
111
+ ],
112
+ "tokenizer_bos_token": [
113
+ "<|begin_of_text|>",
114
+ "128000"
115
+ ],
116
+ "eot_token_id": 128009,
117
+ "max_length": 131072,
118
+ "task_hashes": {},
119
+ "model_source": "hf",
120
+ "model_name": "./models/",
121
+ "model_name_sanitized": ".__models__",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": true,
125
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
126
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
127
+ "start_time": 6784623.486849175,
128
+ "end_time": 6784935.339727577,
129
+ "total_evaluation_time_seconds": "311.85287840198725"
130
+ }
results/llama-3.1-8b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T04-41-01.642818.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "triviaqa": {
4
+ "alias": "triviaqa",
5
+ "exact_match,remove_whitespace": 0.5720575122603656,
6
+ "exact_match_stderr,remove_whitespace": 0.0036937289351404315
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "triviaqa": []
11
+ },
12
+ "configs": {
13
+ "triviaqa": {
14
+ "task": "triviaqa",
15
+ "dataset_path": "trivia_qa",
16
+ "dataset_name": "rc.nocontext",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "Question: {{question}}?\nAnswer:",
20
+ "doc_to_target": "{{answer.aliases}}",
21
+ "unsafe_code": false,
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true,
31
+ "ignore_case": true,
32
+ "ignore_punctuation": true
33
+ }
34
+ ],
35
+ "output_type": "generate_until",
36
+ "generation_kwargs": {
37
+ "until": [
38
+ "\n",
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0
44
+ },
45
+ "repeats": 1,
46
+ "filter_list": [
47
+ {
48
+ "name": "remove_whitespace",
49
+ "filter": [
50
+ {
51
+ "function": "remove_whitespace"
52
+ },
53
+ {
54
+ "function": "take_first"
55
+ }
56
+ ]
57
+ }
58
+ ],
59
+ "should_decontaminate": true,
60
+ "doc_to_decontamination_query": "question",
61
+ "metadata": {
62
+ "version": 3.0,
63
+ "pretrained": "./models/",
64
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf",
65
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
66
+ }
67
+ }
68
+ },
69
+ "versions": {
70
+ "triviaqa": 3.0
71
+ },
72
+ "n-shot": {
73
+ "triviaqa": 5
74
+ },
75
+ "higher_is_better": {
76
+ "triviaqa": {
77
+ "exact_match": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "triviaqa": {
82
+ "original": 17944,
83
+ "effective": 17944
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
89
+ "model_num_parameters": 8030261248,
90
+ "model_dtype": "torch.float32",
91
+ "model_revision": "main",
92
+ "model_sha": "",
93
+ "batch_size": "auto:4",
94
+ "batch_sizes": [],
95
+ "device": null,
96
+ "use_cache": null,
97
+ "limit": null,
98
+ "bootstrap_iters": 100000,
99
+ "gen_kwargs": null,
100
+ "random_seed": 0,
101
+ "numpy_seed": 1234,
102
+ "torch_seed": 1234,
103
+ "fewshot_seed": 1234
104
+ },
105
+ "git_hash": "v0.1.1",
106
+ "date": 1756439418.136281,
107
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
108
+ "transformers_version": "4.55.4",
109
+ "lm_eval_version": "0.4.8",
110
+ "upper_git_hash": null,
111
+ "tokenizer_pad_token": [
112
+ "<|eot_id|>",
113
+ "128009"
114
+ ],
115
+ "tokenizer_eos_token": [
116
+ "<|eot_id|>",
117
+ "128009"
118
+ ],
119
+ "tokenizer_bos_token": [
120
+ "<|begin_of_text|>",
121
+ "128000"
122
+ ],
123
+ "eot_token_id": 128009,
124
+ "max_length": 131072,
125
+ "task_hashes": {},
126
+ "model_source": "hf",
127
+ "model_name": "./models/",
128
+ "model_name_sanitized": ".__models__",
129
+ "system_instruction": null,
130
+ "system_instruction_sha": null,
131
+ "fewshot_as_multiturn": true,
132
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
133
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
134
+ "start_time": 6785063.112292615,
135
+ "end_time": 6788251.762945544,
136
+ "total_evaluation_time_seconds": "3188.650652929209"
137
+ }
results/llama-3.1-8b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-19-24.887264.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "gpqa_main_zeroshot": {
4
+ "alias": "gpqa_main_zeroshot",
5
+ "acc,none": 0.28125,
6
+ "acc_stderr,none": 0.021265785688273954,
7
+ "acc_norm,none": 0.28125,
8
+ "acc_norm_stderr,none": 0.021265785688273954
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "gpqa_main_zeroshot": []
13
+ },
14
+ "configs": {
15
+ "gpqa_main_zeroshot": {
16
+ "task": "gpqa_main_zeroshot",
17
+ "tag": "gpqa",
18
+ "dataset_path": "Idavidrein/gpqa",
19
+ "dataset_name": "gpqa_main",
20
+ "training_split": "train",
21
+ "validation_split": "train",
22
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
23
+ "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:",
24
+ "doc_to_target": "answer",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": [
27
+ "(A)",
28
+ "(B)",
29
+ "(C)",
30
+ "(D)"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 0,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ }
47
+ ],
48
+ "output_type": "multiple_choice",
49
+ "repeats": 1,
50
+ "should_decontaminate": false,
51
+ "metadata": {
52
+ "version": 1.0,
53
+ "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF",
54
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf",
55
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
56
+ }
57
+ }
58
+ },
59
+ "versions": {
60
+ "gpqa_main_zeroshot": 1.0
61
+ },
62
+ "n-shot": {
63
+ "gpqa_main_zeroshot": 0
64
+ },
65
+ "higher_is_better": {
66
+ "gpqa_main_zeroshot": {
67
+ "acc": true,
68
+ "acc_norm": true
69
+ }
70
+ },
71
+ "n-samples": {
72
+ "gpqa_main_zeroshot": {
73
+ "original": 448,
74
+ "effective": 448
75
+ }
76
+ },
77
+ "config": {
78
+ "model": "hf",
79
+ "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
80
+ "model_num_parameters": 8030261248,
81
+ "model_dtype": "torch.float32",
82
+ "model_revision": "main",
83
+ "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f",
84
+ "batch_size": "auto:4",
85
+ "batch_sizes": [
86
+ 9,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.1",
102
+ "date": 1756397596.8580039,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<|eot_id|>",
109
+ "128009"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<|eot_id|>",
113
+ "128009"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<|begin_of_text|>",
117
+ "128000"
118
+ ],
119
+ "eot_token_id": 128009,
120
+ "max_length": 131072,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF",
124
+ "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": false,
128
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
129
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
130
+ "start_time": 6743314.347175269,
131
+ "end_time": 6743755.008235267,
132
+ "total_evaluation_time_seconds": "440.6610599979758"
133
+ }
results/llama-3.1-8b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T11-17-57.196185.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.5762796255725952,
6
+ "acc_stderr,none": 0.0049313726571298755,
7
+ "acc_norm,none": 0.7341167098187612,
8
+ "acc_norm_stderr,none": 0.00440899486864994
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF",
53
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf",
54
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 0
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
79
+ "model_num_parameters": 8030261248,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 64,
86
+ 64,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.1",
102
+ "date": 1756378309.7240536,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<|eot_id|>",
109
+ "128009"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<|eot_id|>",
113
+ "128009"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<|begin_of_text|>",
117
+ "128000"
118
+ ],
119
+ "eot_token_id": 128009,
120
+ "max_length": 131072,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF",
124
+ "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": true,
128
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
129
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
130
+ "start_time": 6724033.028550806,
131
+ "end_time": 6725667.316656574,
132
+ "total_evaluation_time_seconds": "1634.2881057672203"
133
+ }
results/llama-3.1-8b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-11-18.604003.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.5954989046006771,
6
+ "acc_stderr,none": 0.004897921845492068,
7
+ "acc_norm,none": 0.780920135431189,
8
+ "acc_norm_stderr,none": 0.004127775403148651
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF",
53
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf",
54
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 10
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
79
+ "model_num_parameters": 8030261248,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 19,
86
+ 19,
87
+ 22,
88
+ 22
89
+ ],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "v0.1.1",
101
+ "date": 1756379994.0238812,
102
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
103
+ "transformers_version": "4.55.4",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<|eot_id|>",
108
+ "128009"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<|eot_id|>",
112
+ "128009"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<|begin_of_text|>",
116
+ "128000"
117
+ ],
118
+ "eot_token_id": 128009,
119
+ "max_length": 131072,
120
+ "task_hashes": {},
121
+ "model_source": "hf",
122
+ "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF",
123
+ "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "fewshot_as_multiturn": true,
127
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
128
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
129
+ "start_time": 6725716.583365066,
130
+ "end_time": 6739668.724583514,
131
+ "total_evaluation_time_seconds": "13952.141218448058"
132
+ }
results/llama-3.1-8b-instruct-q3_k_m/ifeval/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-29T11-43-19.960215.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "ifeval": {
4
+ "alias": "ifeval",
5
+ "prompt_level_strict_acc,none": 0.711645101663586,
6
+ "prompt_level_strict_acc_stderr,none": 0.019493890350654804,
7
+ "inst_level_strict_acc,none": 0.790167865707434,
8
+ "inst_level_strict_acc_stderr,none": "N/A",
9
+ "prompt_level_loose_acc,none": 0.7597042513863216,
10
+ "prompt_level_loose_acc_stderr,none": 0.018386473581487088,
11
+ "inst_level_loose_acc,none": 0.8237410071942446,
12
+ "inst_level_loose_acc_stderr,none": "N/A"
13
+ }
14
+ },
15
+ "group_subtasks": {
16
+ "ifeval": []
17
+ },
18
+ "configs": {
19
+ "ifeval": {
20
+ "task": "ifeval",
21
+ "dataset_path": "google/IFEval",
22
+ "test_split": "train",
23
+ "doc_to_text": "prompt",
24
+ "doc_to_target": 0,
25
+ "unsafe_code": false,
26
+ "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "prompt_level_strict_acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "inst_level_strict_acc",
39
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "prompt_level_loose_acc",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ },
47
+ {
48
+ "metric": "inst_level_loose_acc",
49
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
50
+ "higher_is_better": true
51
+ }
52
+ ],
53
+ "output_type": "generate_until",
54
+ "generation_kwargs": {
55
+ "until": [],
56
+ "do_sample": false,
57
+ "temperature": 0.0,
58
+ "max_gen_toks": 1280
59
+ },
60
+ "repeats": 1,
61
+ "should_decontaminate": false,
62
+ "metadata": {
63
+ "version": 4.0,
64
+ "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF",
65
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf",
66
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
67
+ }
68
+ }
69
+ },
70
+ "versions": {
71
+ "ifeval": 4.0
72
+ },
73
+ "n-shot": {
74
+ "ifeval": 0
75
+ },
76
+ "higher_is_better": {
77
+ "ifeval": {
78
+ "prompt_level_strict_acc": true,
79
+ "inst_level_strict_acc": true,
80
+ "prompt_level_loose_acc": true,
81
+ "inst_level_loose_acc": true
82
+ }
83
+ },
84
+ "n-samples": {
85
+ "ifeval": {
86
+ "original": 541,
87
+ "effective": 541
88
+ }
89
+ },
90
+ "config": {
91
+ "model": "hf",
92
+ "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
93
+ "model_num_parameters": 8030261248,
94
+ "model_dtype": "torch.float32",
95
+ "model_revision": "main",
96
+ "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f",
97
+ "batch_size": "auto:4",
98
+ "batch_sizes": [],
99
+ "device": null,
100
+ "use_cache": null,
101
+ "limit": null,
102
+ "bootstrap_iters": 100000,
103
+ "gen_kwargs": null,
104
+ "random_seed": 0,
105
+ "numpy_seed": 1234,
106
+ "torch_seed": 1234,
107
+ "fewshot_seed": 1234
108
+ },
109
+ "git_hash": "v0.1.1",
110
+ "date": 1756464684.2343795,
111
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
112
+ "transformers_version": "4.55.4",
113
+ "lm_eval_version": "0.4.8",
114
+ "upper_git_hash": null,
115
+ "tokenizer_pad_token": [
116
+ "<|eot_id|>",
117
+ "128009"
118
+ ],
119
+ "tokenizer_eos_token": [
120
+ "<|eot_id|>",
121
+ "128009"
122
+ ],
123
+ "tokenizer_bos_token": [
124
+ "<|begin_of_text|>",
125
+ "128000"
126
+ ],
127
+ "eot_token_id": 128009,
128
+ "max_length": 131072,
129
+ "task_hashes": {},
130
+ "model_source": "hf",
131
+ "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF",
132
+ "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF",
133
+ "system_instruction": null,
134
+ "system_instruction_sha": null,
135
+ "fewshot_as_multiturn": false,
136
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
137
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
138
+ "start_time": 6810343.246271214,
139
+ "end_time": 6813590.080971116,
140
+ "total_evaluation_time_seconds": "3246.834699901752"
141
+ }
results/llama-3.1-8b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T10-49-52.307915.json ADDED
The diff for this file is too large to render. See raw diff
 
results/llama-3.1-8b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-17-14.136330.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.7976060935799782,
6
+ "acc_stderr,none": 0.009374289682807648,
7
+ "acc_norm,none": 0.794885745375408,
8
+ "acc_norm_stderr,none": 0.009420971671018023
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "baber/piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0,
49
+ "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF",
50
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf",
51
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "piqa": 1.0
57
+ },
58
+ "n-shot": {
59
+ "piqa": 0
60
+ },
61
+ "higher_is_better": {
62
+ "piqa": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "piqa": {
69
+ "original": 1838,
70
+ "effective": 1838
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
76
+ "model_num_parameters": 8030261248,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f",
80
+ "batch_size": "auto:4",
81
+ "batch_sizes": [
82
+ 64,
83
+ 64,
84
+ 64,
85
+ 64,
86
+ 64
87
+ ],
88
+ "device": null,
89
+ "use_cache": null,
90
+ "limit": null,
91
+ "bootstrap_iters": 100000,
92
+ "gen_kwargs": null,
93
+ "random_seed": 0,
94
+ "numpy_seed": 1234,
95
+ "torch_seed": 1234,
96
+ "fewshot_seed": 1234
97
+ },
98
+ "git_hash": "v0.1.1",
99
+ "date": 1756394025.9041305,
100
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
101
+ "transformers_version": "4.55.4",
102
+ "lm_eval_version": "0.4.8",
103
+ "upper_git_hash": null,
104
+ "tokenizer_pad_token": [
105
+ "<|eot_id|>",
106
+ "128009"
107
+ ],
108
+ "tokenizer_eos_token": [
109
+ "<|eot_id|>",
110
+ "128009"
111
+ ],
112
+ "tokenizer_bos_token": [
113
+ "<|begin_of_text|>",
114
+ "128000"
115
+ ],
116
+ "eot_token_id": 128009,
117
+ "max_length": 131072,
118
+ "task_hashes": {},
119
+ "model_source": "hf",
120
+ "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF",
121
+ "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": true,
125
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
126
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
127
+ "start_time": 6739723.437865958,
128
+ "end_time": 6740024.257117974,
129
+ "total_evaluation_time_seconds": "300.8192520160228"
130
+ }
results/llama-3.1-8b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-11-09.665476.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "triviaqa": {
4
+ "alias": "triviaqa",
5
+ "exact_match,remove_whitespace": 0.5716116807846634,
6
+ "exact_match_stderr,remove_whitespace": 0.0036942121228731735
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "triviaqa": []
11
+ },
12
+ "configs": {
13
+ "triviaqa": {
14
+ "task": "triviaqa",
15
+ "dataset_path": "trivia_qa",
16
+ "dataset_name": "rc.nocontext",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "Question: {{question}}?\nAnswer:",
20
+ "doc_to_target": "{{answer.aliases}}",
21
+ "unsafe_code": false,
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true,
31
+ "ignore_case": true,
32
+ "ignore_punctuation": true
33
+ }
34
+ ],
35
+ "output_type": "generate_until",
36
+ "generation_kwargs": {
37
+ "until": [
38
+ "\n",
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0
44
+ },
45
+ "repeats": 1,
46
+ "filter_list": [
47
+ {
48
+ "name": "remove_whitespace",
49
+ "filter": [
50
+ {
51
+ "function": "remove_whitespace"
52
+ },
53
+ {
54
+ "function": "take_first"
55
+ }
56
+ ]
57
+ }
58
+ ],
59
+ "should_decontaminate": true,
60
+ "doc_to_decontamination_query": "question",
61
+ "metadata": {
62
+ "version": 3.0,
63
+ "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF",
64
+ "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf",
65
+ "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct"
66
+ }
67
+ }
68
+ },
69
+ "versions": {
70
+ "triviaqa": 3.0
71
+ },
72
+ "n-shot": {
73
+ "triviaqa": 5
74
+ },
75
+ "higher_is_better": {
76
+ "triviaqa": {
77
+ "exact_match": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "triviaqa": {
82
+ "original": 17944,
83
+ "effective": 17944
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct",
89
+ "model_num_parameters": 8030261248,
90
+ "model_dtype": "torch.float32",
91
+ "model_revision": "main",
92
+ "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f",
93
+ "batch_size": "auto:4",
94
+ "batch_sizes": [],
95
+ "device": null,
96
+ "use_cache": null,
97
+ "limit": null,
98
+ "bootstrap_iters": 100000,
99
+ "gen_kwargs": null,
100
+ "random_seed": 0,
101
+ "numpy_seed": 1234,
102
+ "torch_seed": 1234,
103
+ "fewshot_seed": 1234
104
+ },
105
+ "git_hash": "v0.1.1",
106
+ "date": 1756394437.5566554,
107
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
108
+ "transformers_version": "4.55.4",
109
+ "lm_eval_version": "0.4.8",
110
+ "upper_git_hash": null,
111
+ "tokenizer_pad_token": [
112
+ "<|eot_id|>",
113
+ "128009"
114
+ ],
115
+ "tokenizer_eos_token": [
116
+ "<|eot_id|>",
117
+ "128009"
118
+ ],
119
+ "tokenizer_bos_token": [
120
+ "<|begin_of_text|>",
121
+ "128000"
122
+ ],
123
+ "eot_token_id": 128009,
124
+ "max_length": 131072,
125
+ "task_hashes": {},
126
+ "model_source": "hf",
127
+ "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF",
128
+ "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF",
129
+ "system_instruction": null,
130
+ "system_instruction_sha": null,
131
+ "fewshot_as_multiturn": true,
132
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
133
+ "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65",
134
+ "start_time": 6740125.041809197,
135
+ "end_time": 6743259.785275262,
136
+ "total_evaluation_time_seconds": "3134.7434660652652"
137
+ }
results/llama-3.2-1b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T10-43-39.403807.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "gpqa_main_zeroshot": {
4
+ "alias": "gpqa_main_zeroshot",
5
+ "acc,none": 0.28348214285714285,
6
+ "acc_stderr,none": 0.0213168289872622,
7
+ "acc_norm,none": 0.28348214285714285,
8
+ "acc_norm_stderr,none": 0.0213168289872622
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "gpqa_main_zeroshot": []
13
+ },
14
+ "configs": {
15
+ "gpqa_main_zeroshot": {
16
+ "task": "gpqa_main_zeroshot",
17
+ "tag": "gpqa",
18
+ "dataset_path": "Idavidrein/gpqa",
19
+ "dataset_name": "gpqa_main",
20
+ "training_split": "train",
21
+ "validation_split": "train",
22
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
23
+ "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:",
24
+ "doc_to_target": "answer",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": [
27
+ "(A)",
28
+ "(B)",
29
+ "(C)",
30
+ "(D)"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 0,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ }
47
+ ],
48
+ "output_type": "multiple_choice",
49
+ "repeats": 1,
50
+ "should_decontaminate": false,
51
+ "metadata": {
52
+ "version": 1.0,
53
+ "pretrained": "./models/",
54
+ "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf",
55
+ "tokenizer": "meta-llama/Llama-3.2-1B-Instruct"
56
+ }
57
+ }
58
+ },
59
+ "versions": {
60
+ "gpqa_main_zeroshot": 1.0
61
+ },
62
+ "n-shot": {
63
+ "gpqa_main_zeroshot": 0
64
+ },
65
+ "higher_is_better": {
66
+ "gpqa_main_zeroshot": {
67
+ "acc": true,
68
+ "acc_norm": true
69
+ }
70
+ },
71
+ "n-samples": {
72
+ "gpqa_main_zeroshot": {
73
+ "original": 448,
74
+ "effective": 448
75
+ }
76
+ },
77
+ "config": {
78
+ "model": "hf",
79
+ "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct",
80
+ "model_num_parameters": 1235814400,
81
+ "model_dtype": "torch.float32",
82
+ "model_revision": "main",
83
+ "model_sha": "",
84
+ "batch_size": "auto:4",
85
+ "batch_sizes": [
86
+ 13,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.1",
102
+ "date": 1756464097.7877123,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<|eot_id|>",
109
+ "128009"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<|eot_id|>",
113
+ "128009"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<|begin_of_text|>",
117
+ "128000"
118
+ ],
119
+ "eot_token_id": 128009,
120
+ "max_length": 131072,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "./models/",
124
+ "model_name_sanitized": ".__models__",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": false,
128
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
129
+ "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4",
130
+ "start_time": 6809802.016860311,
131
+ "end_time": 6810009.524530667,
132
+ "total_evaluation_time_seconds": "207.50767035596073"
133
+ }
results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-29T09-23-04.950976.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.4231228838876718,
6
+ "acc_stderr,none": 0.004930448527146583,
7
+ "acc_norm,none": 0.5246962756423024,
8
+ "acc_norm_stderr,none": 0.004983691099110917
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf",
54
+ "tokenizer": "meta-llama/Llama-3.2-1B-Instruct"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 0
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct",
79
+ "model_num_parameters": 1235814400,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 64,
86
+ 64,
87
+ 64,
88
+ 64,
89
+ 64
90
+ ],
91
+ "device": null,
92
+ "use_cache": null,
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "v0.1.1",
102
+ "date": 1756459040.171013,
103
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
104
+ "transformers_version": "4.55.4",
105
+ "lm_eval_version": "0.4.8",
106
+ "upper_git_hash": null,
107
+ "tokenizer_pad_token": [
108
+ "<|eot_id|>",
109
+ "128009"
110
+ ],
111
+ "tokenizer_eos_token": [
112
+ "<|eot_id|>",
113
+ "128009"
114
+ ],
115
+ "tokenizer_bos_token": [
116
+ "<|begin_of_text|>",
117
+ "128000"
118
+ ],
119
+ "eot_token_id": 128009,
120
+ "max_length": 131072,
121
+ "task_hashes": {},
122
+ "model_source": "hf",
123
+ "model_name": "./models/",
124
+ "model_name_sanitized": ".__models__",
125
+ "system_instruction": null,
126
+ "system_instruction_sha": null,
127
+ "fewshot_as_multiturn": true,
128
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
129
+ "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4",
130
+ "start_time": 6804763.703988922,
131
+ "end_time": 6805175.0714428,
132
+ "total_evaluation_time_seconds": "411.3674538778141"
133
+ }
results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T10-13-20.039729.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.4374626568412667,
6
+ "acc_stderr,none": 0.004950598300667601,
7
+ "acc_norm,none": 0.576777534355706,
8
+ "acc_norm_stderr,none": 0.004930603061590628
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "./models/",
53
+ "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf",
54
+ "tokenizer": "meta-llama/Llama-3.2-1B-Instruct"
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "hellaswag": 1.0
60
+ },
61
+ "n-shot": {
62
+ "hellaswag": 10
63
+ },
64
+ "higher_is_better": {
65
+ "hellaswag": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "hellaswag": {
72
+ "original": 10042,
73
+ "effective": 10042
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct",
79
+ "model_num_parameters": 1235814400,
80
+ "model_dtype": "torch.float32",
81
+ "model_revision": "main",
82
+ "model_sha": "",
83
+ "batch_size": "auto:4",
84
+ "batch_sizes": [
85
+ 32,
86
+ 32,
87
+ 32,
88
+ 32
89
+ ],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "v0.1.1",
101
+ "date": 1756459592.3099344,
102
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
103
+ "transformers_version": "4.55.4",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<|eot_id|>",
108
+ "128009"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<|eot_id|>",
112
+ "128009"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<|begin_of_text|>",
116
+ "128000"
117
+ ],
118
+ "eot_token_id": 128009,
119
+ "max_length": 131072,
120
+ "task_hashes": {},
121
+ "model_source": "hf",
122
+ "model_name": "./models/",
123
+ "model_name_sanitized": ".__models__",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "fewshot_as_multiturn": true,
127
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
128
+ "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4",
129
+ "start_time": 6805284.145371093,
130
+ "end_time": 6808190.157087202,
131
+ "total_evaluation_time_seconds": "2906.011716108769"
132
+ }
results/llama-3.2-1b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T14-53-30.492986.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "ifeval": {
4
+ "alias": "ifeval",
5
+ "prompt_level_strict_acc,none": 0.4232902033271719,
6
+ "prompt_level_strict_acc_stderr,none": 0.021261842325248494,
7
+ "inst_level_strict_acc,none": 0.5599520383693045,
8
+ "inst_level_strict_acc_stderr,none": "N/A",
9
+ "prompt_level_loose_acc,none": 0.46210720887245843,
10
+ "prompt_level_loose_acc_stderr,none": 0.021454695436204742,
11
+ "inst_level_loose_acc,none": 0.592326139088729,
12
+ "inst_level_loose_acc_stderr,none": "N/A"
13
+ }
14
+ },
15
+ "group_subtasks": {
16
+ "ifeval": []
17
+ },
18
+ "configs": {
19
+ "ifeval": {
20
+ "task": "ifeval",
21
+ "dataset_path": "google/IFEval",
22
+ "test_split": "train",
23
+ "doc_to_text": "prompt",
24
+ "doc_to_target": 0,
25
+ "unsafe_code": false,
26
+ "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "prompt_level_strict_acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "inst_level_strict_acc",
39
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "prompt_level_loose_acc",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ },
47
+ {
48
+ "metric": "inst_level_loose_acc",
49
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
50
+ "higher_is_better": true
51
+ }
52
+ ],
53
+ "output_type": "generate_until",
54
+ "generation_kwargs": {
55
+ "until": [],
56
+ "do_sample": false,
57
+ "temperature": 0.0,
58
+ "max_gen_toks": 1280
59
+ },
60
+ "repeats": 1,
61
+ "should_decontaminate": false,
62
+ "metadata": {
63
+ "version": 4.0,
64
+ "pretrained": "./models/",
65
+ "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf",
66
+ "tokenizer": "meta-llama/Llama-3.2-1B-Instruct"
67
+ }
68
+ }
69
+ },
70
+ "versions": {
71
+ "ifeval": 4.0
72
+ },
73
+ "n-shot": {
74
+ "ifeval": 0
75
+ },
76
+ "higher_is_better": {
77
+ "ifeval": {
78
+ "prompt_level_strict_acc": true,
79
+ "inst_level_strict_acc": true,
80
+ "prompt_level_loose_acc": true,
81
+ "inst_level_loose_acc": true
82
+ }
83
+ },
84
+ "n-samples": {
85
+ "ifeval": {
86
+ "original": 541,
87
+ "effective": 541
88
+ }
89
+ },
90
+ "config": {
91
+ "model": "hf",
92
+ "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct",
93
+ "model_num_parameters": 1235814400,
94
+ "model_dtype": "torch.float32",
95
+ "model_revision": "main",
96
+ "model_sha": "",
97
+ "batch_size": "auto:4",
98
+ "batch_sizes": [],
99
+ "device": null,
100
+ "use_cache": null,
101
+ "limit": null,
102
+ "bootstrap_iters": 100000,
103
+ "gen_kwargs": null,
104
+ "random_seed": 0,
105
+ "numpy_seed": 1234,
106
+ "torch_seed": 1234,
107
+ "fewshot_seed": 1234
108
+ },
109
+ "git_hash": "v0.1.1",
110
+ "date": 1756477906.445423,
111
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
112
+ "transformers_version": "4.55.4",
113
+ "lm_eval_version": "0.4.8",
114
+ "upper_git_hash": null,
115
+ "tokenizer_pad_token": [
116
+ "<|eot_id|>",
117
+ "128009"
118
+ ],
119
+ "tokenizer_eos_token": [
120
+ "<|eot_id|>",
121
+ "128009"
122
+ ],
123
+ "tokenizer_bos_token": [
124
+ "<|begin_of_text|>",
125
+ "128000"
126
+ ],
127
+ "eot_token_id": 128009,
128
+ "max_length": 131072,
129
+ "task_hashes": {},
130
+ "model_source": "hf",
131
+ "model_name": "./models/",
132
+ "model_name_sanitized": ".__models__",
133
+ "system_instruction": null,
134
+ "system_instruction_sha": null,
135
+ "fewshot_as_multiturn": false,
136
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
137
+ "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4",
138
+ "start_time": 6823613.272541048,
139
+ "end_time": 6825000.613895395,
140
+ "total_evaluation_time_seconds": "1387.3413543468341"
141
+ }
results/llama-3.2-1b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-29T09-15-25.269759.json ADDED
The diff for this file is too large to render. See raw diff
 
results/llama-3.2-1b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T10-17-22.800022.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.6936887921653971,
6
+ "acc_stderr,none": 0.010754970032367363,
7
+ "acc_norm,none": 0.6996735582154516,
8
+ "acc_norm_stderr,none": 0.010695225308183266
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "baber/piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0,
49
+ "pretrained": "./models/",
50
+ "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf",
51
+ "tokenizer": "meta-llama/Llama-3.2-1B-Instruct"
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "piqa": 1.0
57
+ },
58
+ "n-shot": {
59
+ "piqa": 0
60
+ },
61
+ "higher_is_better": {
62
+ "piqa": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "piqa": {
69
+ "original": 1838,
70
+ "effective": 1838
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct",
76
+ "model_num_parameters": 1235814400,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "",
80
+ "batch_size": "auto:4",
81
+ "batch_sizes": [
82
+ 64,
83
+ 64,
84
+ 64,
85
+ 64,
86
+ 64
87
+ ],
88
+ "device": null,
89
+ "use_cache": null,
90
+ "limit": null,
91
+ "bootstrap_iters": 100000,
92
+ "gen_kwargs": null,
93
+ "random_seed": 0,
94
+ "numpy_seed": 1234,
95
+ "torch_seed": 1234,
96
+ "fewshot_seed": 1234
97
+ },
98
+ "git_hash": "v0.1.1",
99
+ "date": 1756462580.8541045,
100
+ "pretty_env_info": "'NoneType' object has no attribute 'splitlines'",
101
+ "transformers_version": "4.55.4",
102
+ "lm_eval_version": "0.4.8",
103
+ "upper_git_hash": null,
104
+ "tokenizer_pad_token": [
105
+ "<|eot_id|>",
106
+ "128009"
107
+ ],
108
+ "tokenizer_eos_token": [
109
+ "<|eot_id|>",
110
+ "128009"
111
+ ],
112
+ "tokenizer_bos_token": [
113
+ "<|begin_of_text|>",
114
+ "128000"
115
+ ],
116
+ "eot_token_id": 128009,
117
+ "max_length": 131072,
118
+ "task_hashes": {},
119
+ "model_source": "hf",
120
+ "model_name": "./models/",
121
+ "model_name_sanitized": ".__models__",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": true,
125
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
126
+ "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4",
127
+ "start_time": 6808297.029727637,
128
+ "end_time": 6808432.920860001,
129
+ "total_evaluation_time_seconds": "135.89113236404955"
130
+ }