diff --git a/results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T20-10-17.511223.json b/results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T20-10-17.511223.json new file mode 100644 index 0000000000000000000000000000000000000000..ff8e6470d8d18f369917dca7921e9e006c99d9e5 --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T20-10-17.511223.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4598685520812587, + "acc_stderr,none": 0.004973683026201962, + "acc_norm,none": 0.6090420235012945, + "acc_norm_stderr,none": 0.004869677330801213 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q3_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756238804.3530836, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10370169.769943833, + "end_time": 10370440.947691692, + "total_evaluation_time_seconds": "271.17774785868824" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T20-44-05.253280.json b/results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T20-44-05.253280.json new file mode 100644 index 0000000000000000000000000000000000000000..c3ad5760dc63b4af26271b46f2c94f8ab8c6d3d6 --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T20-44-05.253280.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.45439155546703847, + "acc_stderr,none": 0.004968979259737878, + "acc_norm,none": 0.6121290579565823, + "acc_norm_stderr,none": 0.00486269059481592 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q3_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 17, + 17, + 19, + 19 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756239118.122381, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10370483.85469766, + "end_time": 10372468.684128964, + "total_evaluation_time_seconds": "1984.8294313047081" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T20-05-03.343796.json b/results/gemma-3-1b-pt-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T20-05-03.343796.json new file mode 100644 index 0000000000000000000000000000000000000000..1c66bffde6987b636f768a1b267851b32774e40f --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T20-05-03.343796.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.2553767269619712, + "acc_stderr,none": 0.0036744672232731593, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2718384697130712, + "acc_stderr,none": 0.006478056366785472, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.15079365079365079, + "acc_stderr,none": 0.03200686497287397 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.28484848484848485, + "acc_stderr,none": 0.03524390844511785 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.25980392156862747, + "acc_stderr,none": 0.030778554678693247 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.26582278481012656, + "acc_stderr,none": 0.02875679962965839 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.38016528925619836, + "acc_stderr,none": 0.044313245019684325 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.23148148148148148, + "acc_stderr,none": 0.040774947092526284 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.3006134969325153, + "acc_stderr,none": 0.036025113188067656 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.2976878612716763, + "acc_stderr,none": 0.02461705538867703 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.24692737430167597, + "acc_stderr,none": 0.014422292204808857 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.3022508038585209, + "acc_stderr,none": 0.026082700695399676 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.29012345679012347, + "acc_stderr,none": 0.02525117393649498 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.27053455019556716, + "acc_stderr,none": 0.011345996743539222 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.29239766081871343, + "acc_stderr,none": 0.03488647713457921 + }, + "mmlu_other": { + "acc,none": 0.23913743160605086, + "acc_stderr,none": 0.007641968540146066, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.23, + "acc_stderr,none": 0.04229525846816507 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.2188679245283019, + "acc_stderr,none": 0.025447863825108594 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.2543352601156069, + "acc_stderr,none": 0.03320556443085566 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.29, + "acc_stderr,none": 0.045604802157206865 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.20179372197309417, + "acc_stderr,none": 0.02693611191280226 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.1941747572815534, + "acc_stderr,none": 0.03916667762822582 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.2564102564102564, + "acc_stderr,none": 0.02860595370200432 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.2, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.2707535121328225, + "acc_stderr,none": 0.01588988836256046 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.25163398692810457, + "acc_stderr,none": 0.024848018263875137 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.2695035460992908, + "acc_stderr,none": 0.02646903681859067 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.16544117647058823, + "acc_stderr,none": 0.022571771025494763 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.20481927710843373, + "acc_stderr,none": 0.031417842916639266 + }, + "mmlu_social_sciences": { + "acc,none": 0.23756906077348067, + "acc_stderr,none": 0.0076759851859417305, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.24561403508771928, + "acc_stderr,none": 0.040493392977481425 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.25252525252525254, + "acc_stderr,none": 0.03095405547036587 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.22797927461139897, + "acc_stderr,none": 0.030276909945178256 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.2128205128205128, + "acc_stderr,none": 0.020752423722128037 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.21008403361344538, + "acc_stderr,none": 0.026461398717471864 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.22201834862385322, + "acc_stderr,none": 0.017818849564796544 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.22900763358778625, + "acc_stderr,none": 0.036853466317118506 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.2761437908496732, + "acc_stderr,none": 0.018087276935663185 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.20909090909090908, + "acc_stderr,none": 0.03895091015724137 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.24081632653061225, + "acc_stderr,none": 0.027372942201788125 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.24378109452736318, + "acc_stderr,none": 0.03036049015401464 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.25, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_stem": { + "acc,none": 0.26419283222327944, + "acc_stderr,none": 0.007843601275189417, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.04072314811876841 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.3026315789473684, + "acc_stderr,none": 0.03738520676119667 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.2569444444444444, + "acc_stderr,none": 0.03653946969442102 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.18, + "acc_stderr,none": 0.03861229196653691 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.25, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.21568627450980393, + "acc_stderr,none": 0.04092563958237658 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.29, + "acc_stderr,none": 0.045604802157206865 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.2, + "acc_stderr,none": 0.02614881801842454 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.296551724137931, + "acc_stderr,none": 0.038061426873099886 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.2671957671957672, + "acc_stderr,none": 0.02278967314577664 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.25161290322580643, + "acc_stderr,none": 0.02468597928624002 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.2955665024630542, + "acc_stderr,none": 0.03210494433751457 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.26296296296296295, + "acc_stderr,none": 0.026842057873833727 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.271523178807947, + "acc_stderr,none": 0.03631329803969657 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.21296296296296297, + "acc_stderr,none": 0.027920963147993683 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.24107142857142858, + "acc_stderr,none": 0.040598672469526885 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.2553767269619712, + "acc_stderr,none": 0.0036744672232731593, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2718384697130712, + "acc_stderr,none": 0.006478056366785472, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.23913743160605086, + "acc_stderr,none": 0.007641968540146066, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.23756906077348067, + "acc_stderr,none": 0.0076759851859417305, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.26419283222327944, + "acc_stderr,none": 0.007843601275189417, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q3_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 7, + 19, + 32, + 32, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756237780.356474, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10369144.931350388, + "end_time": 10370126.781975055, + "total_evaluation_time_seconds": "981.8506246674806" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T20-46-37.211472.json b/results/gemma-3-1b-pt-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T20-46-37.211472.json new file mode 100644 index 0000000000000000000000000000000000000000..1cecb0d16b6724b4204b06870477ac345217d292 --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T20-46-37.211472.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7372143634385201, + "acc_stderr,none": 0.01026935406814087, + "acc_norm,none": 0.7415669205658324, + "acc_norm_stderr,none": 0.010213971636773348 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q3_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756241146.1941133, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10372511.903404668, + "end_time": 10372620.646790544, + "total_evaluation_time_seconds": "108.7433858755976" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T21-10-30.485409.json b/results/gemma-3-1b-pt-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T21-10-30.485409.json new file mode 100644 index 0000000000000000000000000000000000000000..a7fc9f410265ee25d31ee06d4ac596a3b5c401ee --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T21-10-30.485409.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.3350423539901917, + "exact_match_stderr,remove_whitespace": 0.0035237031863525254 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q3_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q3_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756241298.2547183, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10372663.582597185, + "end_time": 10374053.921114726, + "total_evaluation_time_seconds": "1390.3385175410658" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q3_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-23-37.420421.json b/results/gemma-3-1b-pt-q3_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-23-37.420421.json new file mode 100644 index 0000000000000000000000000000000000000000..02ad12fcb62b868f51fe33c9d31417678f7e8e57 --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-23-37.420421.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4614618601872137, + "acc_stderr,none": 0.004974937803907778, + "acc_norm,none": 0.608743278231428, + "acc_norm_stderr,none": 0.004870342592914952 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q3_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756207197.8950393, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10338562.802890183, + "end_time": 10338840.856532628, + "total_evaluation_time_seconds": "278.0536424443126" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q3_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-57-37.974497.json b/results/gemma-3-1b-pt-q3_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-57-37.974497.json new file mode 100644 index 0000000000000000000000000000000000000000..0fdf403be3d2f11031ce96ff626c1a4e58a9cfcf --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-57-37.974497.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.45737900816570404, + "acc_stderr,none": 0.004971619995880016, + "acc_norm,none": 0.6138219478191596, + "acc_norm_stderr,none": 0.0048587719634691625 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q3_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 17, + 17, + 19, + 19 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756207518.121341, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10338883.798745643, + "end_time": 10340881.405883452, + "total_evaluation_time_seconds": "1997.6071378085762" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q3_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-18-15.608165.json b/results/gemma-3-1b-pt-q3_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-18-15.608165.json new file mode 100644 index 0000000000000000000000000000000000000000..a29ddde8a81d05e14549dd8eb7b83ca0603b2455 --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-18-15.608165.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.25544794188861986, + "acc_stderr,none": 0.003674661313188105, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2716259298618491, + "acc_stderr,none": 0.0064765778278296155, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.15079365079365079, + "acc_stderr,none": 0.03200686497287397 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.28484848484848485, + "acc_stderr,none": 0.03524390844511785 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.25980392156862747, + "acc_stderr,none": 0.030778554678693247 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.26582278481012656, + "acc_stderr,none": 0.02875679962965839 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.38016528925619836, + "acc_stderr,none": 0.044313245019684325 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.23148148148148148, + "acc_stderr,none": 0.040774947092526284 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.3006134969325153, + "acc_stderr,none": 0.036025113188067656 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.2976878612716763, + "acc_stderr,none": 0.02461705538867703 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.24692737430167597, + "acc_stderr,none": 0.014422292204808857 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.3022508038585209, + "acc_stderr,none": 0.026082700695399676 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.28703703703703703, + "acc_stderr,none": 0.025171041915309698 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.27053455019556716, + "acc_stderr,none": 0.011345996743539222 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.29239766081871343, + "acc_stderr,none": 0.03488647713457921 + }, + "mmlu_other": { + "acc,none": 0.23913743160605086, + "acc_stderr,none": 0.007640179707678254, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.23, + "acc_stderr,none": 0.04229525846816507 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.2188679245283019, + "acc_stderr,none": 0.025447863825108594 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.2543352601156069, + "acc_stderr,none": 0.03320556443085566 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.20179372197309417, + "acc_stderr,none": 0.02693611191280226 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.1941747572815534, + "acc_stderr,none": 0.03916667762822582 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.2564102564102564, + "acc_stderr,none": 0.02860595370200432 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.2, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.2707535121328225, + "acc_stderr,none": 0.01588988836256046 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.25163398692810457, + "acc_stderr,none": 0.024848018263875137 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.2695035460992908, + "acc_stderr,none": 0.02646903681859067 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.16176470588235295, + "acc_stderr,none": 0.022368672562886723 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.20481927710843373, + "acc_stderr,none": 0.031417842916639266 + }, + "mmlu_social_sciences": { + "acc,none": 0.23756906077348067, + "acc_stderr,none": 0.0076759851859417305, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.24561403508771928, + "acc_stderr,none": 0.040493392977481425 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.25252525252525254, + "acc_stderr,none": 0.03095405547036587 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.22797927461139897, + "acc_stderr,none": 0.030276909945178256 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.2128205128205128, + "acc_stderr,none": 0.020752423722128037 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.21008403361344538, + "acc_stderr,none": 0.026461398717471864 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.22201834862385322, + "acc_stderr,none": 0.017818849564796544 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.22900763358778625, + "acc_stderr,none": 0.036853466317118506 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.2761437908496732, + "acc_stderr,none": 0.018087276935663185 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.20909090909090908, + "acc_stderr,none": 0.03895091015724137 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.24081632653061225, + "acc_stderr,none": 0.027372942201788125 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.24378109452736318, + "acc_stderr,none": 0.03036049015401464 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.25, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_stem": { + "acc,none": 0.26482714874722485, + "acc_stderr,none": 0.007849828886272616, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.04072314811876841 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.3026315789473684, + "acc_stderr,none": 0.03738520676119667 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.2569444444444444, + "acc_stderr,none": 0.03653946969442102 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.18, + "acc_stderr,none": 0.03861229196653691 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.32, + "acc_stderr,none": 0.04688261722621507 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.25, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.21568627450980393, + "acc_stderr,none": 0.04092563958237658 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.29, + "acc_stderr,none": 0.045604802157206865 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.20425531914893616, + "acc_stderr,none": 0.02635515841334947 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.296551724137931, + "acc_stderr,none": 0.038061426873099886 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.2671957671957672, + "acc_stderr,none": 0.02278967314577664 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.25161290322580643, + "acc_stderr,none": 0.02468597928624002 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.2955665024630542, + "acc_stderr,none": 0.03210494433751457 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.26296296296296295, + "acc_stderr,none": 0.026842057873833727 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.271523178807947, + "acc_stderr,none": 0.03631329803969657 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.21296296296296297, + "acc_stderr,none": 0.027920963147993683 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.24107142857142858, + "acc_stderr,none": 0.040598672469526885 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.25544794188861986, + "acc_stderr,none": 0.003674661313188105, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2716259298618491, + "acc_stderr,none": 0.0064765778278296155, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.23913743160605086, + "acc_stderr,none": 0.007640179707678254, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.23756906077348067, + "acc_stderr,none": 0.0076759851859417305, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.26482714874722485, + "acc_stderr,none": 0.007849828886272616, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q3_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 7, + 19, + 32, + 32, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756206173.6385045, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10337537.084415607, + "end_time": 10338519.042983947, + "total_evaluation_time_seconds": "981.9585683401674" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q3_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-00-16.738180.json b/results/gemma-3-1b-pt-q3_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-00-16.738180.json new file mode 100644 index 0000000000000000000000000000000000000000..5656bbcf3419d4a7bd86ab5a924a123e36146ddf --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-00-16.738180.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7372143634385201, + "acc_stderr,none": 0.01026935406814087, + "acc_norm,none": 0.7415669205658324, + "acc_norm_stderr,none": 0.010213971636773348 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q3_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756209560.3927722, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10340925.41148755, + "end_time": 10341040.176555607, + "total_evaluation_time_seconds": "114.76506805792451" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q3_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-24-50.975707.json b/results/gemma-3-1b-pt-q3_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-24-50.975707.json new file mode 100644 index 0000000000000000000000000000000000000000..ddce7f1cc8cb9cd15b9e16208841f0c371566741 --- /dev/null +++ b/results/gemma-3-1b-pt-q3_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T12-24-50.975707.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.3355439144003567, + "exact_match_stderr,remove_whitespace": 0.0035250095379466854 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q3_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q3_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756209717.293979, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10341082.841870524, + "end_time": 10342514.413853284, + "total_evaluation_time_seconds": "1431.5719827599823" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T18-44-50.062799.json b/results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T18-44-50.062799.json new file mode 100644 index 0000000000000000000000000000000000000000..7a6fc2aa4e9af6bf82ab23ee317c64d754924902 --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T18-44-50.062799.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4640509858593906, + "acc_stderr,none": 0.004976867796583177, + "acc_norm,none": 0.6145190201155148, + "acc_norm_stderr,none": 0.004857140410776821 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q4_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756233676.0701602, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10365040.74355738, + "end_time": 10365313.498939076, + "total_evaluation_time_seconds": "272.7553816959262" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T19-20-45.909443.json b/results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T19-20-45.909443.json new file mode 100644 index 0000000000000000000000000000000000000000..3391b0ee2341619aa5e241e5798ca91de12f32c6 --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T19-20-45.909443.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.46325433180641307, + "acc_stderr,none": 0.004976288321682394, + "acc_norm,none": 0.6208922525393348, + "acc_norm_stderr,none": 0.004841734453506477 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q4_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 17, + 17, + 19, + 19 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756234118.1483746, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10365356.778950576, + "end_time": 10367469.338681001, + "total_evaluation_time_seconds": "2112.559730425477" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T18-39-32.962297.json b/results/gemma-3-1b-pt-q4_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T18-39-32.962297.json new file mode 100644 index 0000000000000000000000000000000000000000..6afced2a9c8038af8b6efdc22c1a4977d9ed8c19 --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T18-39-32.962297.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.2614299957271044, + "acc_stderr,none": 0.003702573138061331, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.27396386822529223, + "acc_stderr,none": 0.006490529699605641, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.15079365079365079, + "acc_stderr,none": 0.03200686497287397 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.28484848484848485, + "acc_stderr,none": 0.03524390844511785 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.25980392156862747, + "acc_stderr,none": 0.030778554678693247 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.26582278481012656, + "acc_stderr,none": 0.02875679962965839 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.371900826446281, + "acc_stderr,none": 0.044120158066245085 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.2037037037037037, + "acc_stderr,none": 0.038935425188248496 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.3006134969325153, + "acc_stderr,none": 0.036025113188067656 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.3236994219653179, + "acc_stderr,none": 0.02519018132760835 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.24804469273743016, + "acc_stderr,none": 0.014444157808261434 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.3054662379421222, + "acc_stderr,none": 0.026160584450140488 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.30246913580246915, + "acc_stderr,none": 0.025557653981868003 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.27053455019556716, + "acc_stderr,none": 0.011345996743539222 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.28654970760233917, + "acc_stderr,none": 0.03467826685703828 + }, + "mmlu_other": { + "acc,none": 0.2536208561313164, + "acc_stderr,none": 0.007803893677077714, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.23, + "acc_stderr,none": 0.04229525846816507 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.23018867924528302, + "acc_stderr,none": 0.02590789712240816 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.26011560693641617, + "acc_stderr,none": 0.03345036916788986 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.25, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.19730941704035873, + "acc_stderr,none": 0.02670985334496796 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.17475728155339806, + "acc_stderr,none": 0.03760178006026618 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.2564102564102564, + "acc_stderr,none": 0.02860595370200432 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.2669220945083014, + "acc_stderr,none": 0.015818450894777576 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.2581699346405229, + "acc_stderr,none": 0.025058503316958167 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.26595744680851063, + "acc_stderr,none": 0.026358065698880644 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.3088235294117647, + "acc_stderr,none": 0.028064998167040053 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.2289156626506024, + "acc_stderr,none": 0.03270745277352474 + }, + "mmlu_social_sciences": { + "acc,none": 0.24211894702632433, + "acc_stderr,none": 0.007717481128456627, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.2543859649122807, + "acc_stderr,none": 0.04096985139843672 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.26262626262626265, + "acc_stderr,none": 0.031353050095330834 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.22797927461139897, + "acc_stderr,none": 0.030276909945178256 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.21025641025641026, + "acc_stderr,none": 0.020660597485026883 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.21008403361344538, + "acc_stderr,none": 0.026461398717471864 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.21651376146788992, + "acc_stderr,none": 0.017658710594443204 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.24427480916030533, + "acc_stderr,none": 0.037683359597287434 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.28431372549019607, + "acc_stderr,none": 0.01824902441120767 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.23636363636363636, + "acc_stderr,none": 0.04069306319721375 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.24897959183673468, + "acc_stderr,none": 0.027682979522960276 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.22388059701492538, + "acc_stderr,none": 0.029475250236017162 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.32, + "acc_stderr,none": 0.04688261722621507 + }, + "mmlu_stem": { + "acc,none": 0.269267364414843, + "acc_stderr,none": 0.007889132269663172, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.35, + "acc_stderr,none": 0.04793724854411023 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.28888888888888886, + "acc_stderr,none": 0.0391545063041425 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.32894736842105265, + "acc_stderr,none": 0.03823428969926598 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.2638888888888889, + "acc_stderr,none": 0.03685651095897531 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.19, + "acc_stderr,none": 0.039427724440366255 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.25, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.23529411764705882, + "acc_stderr,none": 0.04220773659171447 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.17872340425531916, + "acc_stderr,none": 0.025045373272050923 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.2689655172413793, + "acc_stderr,none": 0.03695183311650232 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.2671957671957672, + "acc_stderr,none": 0.02278967314577664 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.25161290322580643, + "acc_stderr,none": 0.02468597928624002 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.30049261083743845, + "acc_stderr,none": 0.03225799476233486 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.32, + "acc_stderr,none": 0.04688261722621507 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.26666666666666666, + "acc_stderr,none": 0.026962424325073904 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.25165562913907286, + "acc_stderr,none": 0.03543304234389988 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.3194444444444444, + "acc_stderr,none": 0.03179876342176851 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.24107142857142858, + "acc_stderr,none": 0.040598672469526885 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.2614299957271044, + "acc_stderr,none": 0.003702573138061331, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.27396386822529223, + "acc_stderr,none": 0.006490529699605641, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.2536208561313164, + "acc_stderr,none": 0.007803893677077714, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.24211894702632433, + "acc_stderr,none": 0.007717481128456627, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.269267364414843, + "acc_stderr,none": 0.007889132269663172, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q4_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 7, + 19, + 32, + 32, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756232654.6878765, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10364019.560758956, + "end_time": 10364996.400405727, + "total_evaluation_time_seconds": "976.8396467715502" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T19-23-19.234939.json b/results/gemma-3-1b-pt-q4_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T19-23-19.234939.json new file mode 100644 index 0000000000000000000000000000000000000000..980dfdb8cdbba8681cb1466f7bd728436d0a6765 --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T19-23-19.234939.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7421109902067464, + "acc_stderr,none": 0.010206956662056201, + "acc_norm,none": 0.7464635473340587, + "acc_norm_stderr,none": 0.010150090834551817 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q4_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756236147.3807654, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10367512.699529504, + "end_time": 10367622.673312971, + "total_evaluation_time_seconds": "109.97378346696496" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T19-47-21.865123.json b/results/gemma-3-1b-pt-q4_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T19-47-21.865123.json new file mode 100644 index 0000000000000000000000000000000000000000..208482d8b9a504d15e9815e622d5c8604fc83008 --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T19-47-21.865123.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.34490637539010255, + "exact_match_stderr,remove_whitespace": 0.0035485813761982864 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q4_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q4_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756236300.3162215, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10367665.693355573, + "end_time": 10369065.303484928, + "total_evaluation_time_seconds": "1399.6101293545216" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-00-04.167957.json b/results/gemma-3-1b-pt-q4_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-00-04.167957.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f534d088cd40b733287b47086d766b2eb4fe31 --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-00-04.167957.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.46554471220872334, + "acc_stderr,none": 0.004977919906875265, + "acc_norm,none": 0.6160127464648476, + "acc_norm_stderr,none": 0.004853608805843713 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q4_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756202133.0889144, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10333496.188994976, + "end_time": 10333827.604686547, + "total_evaluation_time_seconds": "331.41569157131016" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-34-47.776962.json b/results/gemma-3-1b-pt-q4_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-34-47.776962.json new file mode 100644 index 0000000000000000000000000000000000000000..db0f646305d4e17e7956c10f2be741d627703ac9 --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-34-47.776962.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.46683927504481176, + "acc_stderr,none": 0.004978795454216555, + "acc_norm,none": 0.6236805417247561, + "acc_norm_stderr,none": 0.0048347158142077054 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q4_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 15, + 19, + 19, + 19 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756202509.090127, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10333872.521076232, + "end_time": 10335911.204941303, + "total_evaluation_time_seconds": "2038.683865070343" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T09-53-47.483830.json b/results/gemma-3-1b-pt-q4_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T09-53-47.483830.json new file mode 100644 index 0000000000000000000000000000000000000000..6cd9d476edc28d8b989990af42d035cf93c949ba --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T09-53-47.483830.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.26114513602050987, + "acc_stderr,none": 0.0037017347311293605, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2733262486716259, + "acc_stderr,none": 0.006488330447486154, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.15079365079365079, + "acc_stderr,none": 0.03200686497287397 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.28484848484848485, + "acc_stderr,none": 0.03524390844511785 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.25980392156862747, + "acc_stderr,none": 0.030778554678693247 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.270042194092827, + "acc_stderr,none": 0.02890072190629346 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.371900826446281, + "acc_stderr,none": 0.044120158066245085 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.23148148148148148, + "acc_stderr,none": 0.040774947092526284 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.3006134969325153, + "acc_stderr,none": 0.036025113188067656 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.3179190751445087, + "acc_stderr,none": 0.025070713719153193 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.24804469273743016, + "acc_stderr,none": 0.014444157808261434 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.3022508038585209, + "acc_stderr,none": 0.026082700695399676 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.2962962962962963, + "acc_stderr,none": 0.0254071977988902 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.27053455019556716, + "acc_stderr,none": 0.011345996743539222 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.27485380116959063, + "acc_stderr,none": 0.03424042924691578 + }, + "mmlu_other": { + "acc,none": 0.25555197940135177, + "acc_stderr,none": 0.007823911280330501, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.23, + "acc_stderr,none": 0.04229525846816507 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.2339622641509434, + "acc_stderr,none": 0.02605529690115286 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.26011560693641617, + "acc_stderr,none": 0.03345036916788986 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.23, + "acc_stderr,none": 0.04229525846816507 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.19730941704035873, + "acc_stderr,none": 0.02670985334496796 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.17475728155339806, + "acc_stderr,none": 0.03760178006026618 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.2564102564102564, + "acc_stderr,none": 0.02860595370200432 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.26, + "acc_stderr,none": 0.0440844002276808 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.2681992337164751, + "acc_stderr,none": 0.015842430835269407 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.26143790849673204, + "acc_stderr,none": 0.025160998214292445 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.26595744680851063, + "acc_stderr,none": 0.026358065698880644 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.3088235294117647, + "acc_stderr,none": 0.028064998167040053 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.26506024096385544, + "acc_stderr,none": 0.034360240379449694 + }, + "mmlu_social_sciences": { + "acc,none": 0.24049398765030874, + "acc_stderr,none": 0.007701018846448633, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.2543859649122807, + "acc_stderr,none": 0.04096985139843672 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.25757575757575757, + "acc_stderr,none": 0.031156269519646826 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.23316062176165803, + "acc_stderr,none": 0.03051611137147603 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.21025641025641026, + "acc_stderr,none": 0.020660597485026883 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.21008403361344538, + "acc_stderr,none": 0.026461398717471864 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.21467889908256882, + "acc_stderr,none": 0.01760430414925655 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.24427480916030533, + "acc_stderr,none": 0.037683359597287434 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.28104575163398693, + "acc_stderr,none": 0.018185218954318044 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.2636363636363636, + "acc_stderr,none": 0.04220224692971989 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.24081632653061225, + "acc_stderr,none": 0.027372942201788125 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.21393034825870647, + "acc_stderr,none": 0.028996909693328965 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_stem": { + "acc,none": 0.26863304789089754, + "acc_stderr,none": 0.007881416505635256, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.3037037037037037, + "acc_stderr,none": 0.03972552884785133 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.29605263157894735, + "acc_stderr,none": 0.037150621549989084 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.2638888888888889, + "acc_stderr,none": 0.03685651095897531 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.19, + "acc_stderr,none": 0.039427724440366255 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.26, + "acc_stderr,none": 0.0440844002276808 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.23529411764705882, + "acc_stderr,none": 0.04220773659171447 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.17872340425531916, + "acc_stderr,none": 0.025045373272050923 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.2827586206896552, + "acc_stderr,none": 0.03752833958003335 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.2671957671957672, + "acc_stderr,none": 0.02278967314577664 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.24516129032258063, + "acc_stderr,none": 0.024472243840895462 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.30049261083743845, + "acc_stderr,none": 0.03225799476233486 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.35, + "acc_stderr,none": 0.04793724854411023 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.26296296296296295, + "acc_stderr,none": 0.026842057873833727 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.271523178807947, + "acc_stderr,none": 0.03631329803969657 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.26851851851851855, + "acc_stderr,none": 0.030225226160012435 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.24107142857142858, + "acc_stderr,none": 0.040598672469526885 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.26114513602050987, + "acc_stderr,none": 0.0037017347311293605, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2733262486716259, + "acc_stderr,none": 0.006488330447486154, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.25555197940135177, + "acc_stderr,none": 0.007823911280330501, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.24049398765030874, + "acc_stderr,none": 0.007701018846448633, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.26863304789089754, + "acc_stderr,none": 0.007881416505635256, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q4_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 7, + 19, + 32, + 32, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756201100.9444952, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10332464.762157124, + "end_time": 10333450.919544121, + "total_evaluation_time_seconds": "986.1573869977146" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-37-24.805509.json b/results/gemma-3-1b-pt-q4_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-37-24.805509.json new file mode 100644 index 0000000000000000000000000000000000000000..513dff3999986646fd34f13c803259ee2c02e4ef --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T10-37-24.805509.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7404787812840044, + "acc_stderr,none": 0.010227939888174076, + "acc_norm,none": 0.7448313384113167, + "acc_norm_stderr,none": 0.010171571592521887 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q4_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756204589.929225, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10335954.78639334, + "end_time": 10336068.2430083, + "total_evaluation_time_seconds": "113.45661495998502" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q4_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-01-05.515541.json b/results/gemma-3-1b-pt-q4_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-01-05.515541.json new file mode 100644 index 0000000000000000000000000000000000000000..85b579dadc10f5c8ac28b8be5b4789ee158cdedf --- /dev/null +++ b/results/gemma-3-1b-pt-q4_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T11-01-05.515541.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.3492532322781988, + "exact_match_stderr,remove_whitespace": 0.0035590058209197333 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q4_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q4_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756204746.1531723, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10336111.31547239, + "end_time": 10337488.953307116, + "total_evaluation_time_seconds": "1377.6378347259015" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T12-48-45.405550.json b/results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T12-48-45.405550.json new file mode 100644 index 0000000000000000000000000000000000000000..e1b9a52466da6f238e46bd3c87184d463c069e22 --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T12-48-45.405550.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4690300736904999, + "acc_stderr,none": 0.004980200451851498, + "acc_norm,none": 0.6174068910575583, + "acc_norm_stderr,none": 0.004850268986903106 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q5_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756212308.6486008, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10343674.172472687, + "end_time": 10343948.842254344, + "total_evaluation_time_seconds": "274.6697816569358" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T13-22-33.247468.json b/results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T13-22-33.247468.json new file mode 100644 index 0000000000000000000000000000000000000000..fdbba9351d4fa1156ce3c0e4bde04411eb9d9567 --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T13-22-33.247468.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4682334196375224, + "acc_stderr,none": 0.004979700695747546, + "acc_norm,none": 0.622087233618801, + "acc_norm_stderr,none": 0.004838747305783286 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q5_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 17, + 17, + 19, + 19 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756212626.8379686, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10343992.036112672, + "end_time": 10345976.675650535, + "total_evaluation_time_seconds": "1984.6395378634334" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T12-43-27.880531.json b/results/gemma-3-1b-pt-q5_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T12-43-27.880531.json new file mode 100644 index 0000000000000000000000000000000000000000..7de5cb84cf79576d707471341a525193e09f6f86 --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T12-43-27.880531.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.26912120780515597, + "acc_stderr,none": 0.0037312548293410037, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.27481402763018065, + "acc_stderr,none": 0.006492958113495865, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.15079365079365079, + "acc_stderr,none": 0.03200686497287397 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.2787878787878788, + "acc_stderr,none": 0.03501438706296781 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.2647058823529412, + "acc_stderr,none": 0.030964517926923372 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.2489451476793249, + "acc_stderr,none": 0.028146970599422644 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.4049586776859504, + "acc_stderr,none": 0.04481137755942466 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.21296296296296297, + "acc_stderr,none": 0.039578354719809784 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.3006134969325153, + "acc_stderr,none": 0.036025113188067656 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.3179190751445087, + "acc_stderr,none": 0.025070713719153193 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.24692737430167597, + "acc_stderr,none": 0.014422292204808857 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.31189710610932475, + "acc_stderr,none": 0.026311858071854183 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.3117283950617284, + "acc_stderr,none": 0.025773111169630415 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.27053455019556716, + "acc_stderr,none": 0.011345996743539222 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.29239766081871343, + "acc_stderr,none": 0.03488647713457921 + }, + "mmlu_other": { + "acc,none": 0.2706791116832958, + "acc_stderr,none": 0.007946546211766235, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.21, + "acc_stderr,none": 0.040936018074033236 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.25660377358490566, + "acc_stderr,none": 0.026880647889052034 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.2658959537572254, + "acc_stderr,none": 0.033687629322594316 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.26, + "acc_stderr,none": 0.0440844002276808 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.19730941704035873, + "acc_stderr,none": 0.02670985334496796 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.2524271844660194, + "acc_stderr,none": 0.04301250399690879 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.26495726495726496, + "acc_stderr,none": 0.02891120880274955 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.2681992337164751, + "acc_stderr,none": 0.015842430835269407 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.27124183006535946, + "acc_stderr,none": 0.02545775669666782 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.2624113475177305, + "acc_stderr,none": 0.02624492034984306 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4007352941176471, + "acc_stderr,none": 0.02976826352893308 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.26506024096385544, + "acc_stderr,none": 0.034360240379449694 + }, + "mmlu_social_sciences": { + "acc,none": 0.2430939226519337, + "acc_stderr,none": 0.007735863489554942, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.2543859649122807, + "acc_stderr,none": 0.04096985139843672 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.2676767676767677, + "acc_stderr,none": 0.031544498882702825 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.22279792746113988, + "acc_stderr,none": 0.030031147977641528 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.2153846153846154, + "acc_stderr,none": 0.02084303455746285 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.226890756302521, + "acc_stderr,none": 0.027205371538279483 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.22568807339449543, + "acc_stderr,none": 0.01792308766780296 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.24427480916030533, + "acc_stderr,none": 0.037683359597287434 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.2630718954248366, + "acc_stderr,none": 0.017812676542320723 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.2545454545454545, + "acc_stderr,none": 0.04172343038705381 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.24897959183673468, + "acc_stderr,none": 0.027682979522960276 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.23383084577114427, + "acc_stderr,none": 0.02992941540834832 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_stem": { + "acc,none": 0.2844909609895338, + "acc_stderr,none": 0.007996769080081133, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.34, + "acc_stderr,none": 0.04760952285695233 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.2814814814814815, + "acc_stderr,none": 0.03885004245800249 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.40131578947368424, + "acc_stderr,none": 0.039889037033362836 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.2847222222222222, + "acc_stderr,none": 0.037738099906869334 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.26, + "acc_stderr,none": 0.0440844002276808 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.34, + "acc_stderr,none": 0.04760952285695233 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.25, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.22549019607843138, + "acc_stderr,none": 0.04158307533083289 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.19148936170212766, + "acc_stderr,none": 0.025722149992637826 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.296551724137931, + "acc_stderr,none": 0.038061426873099886 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.26455026455026454, + "acc_stderr,none": 0.022717467897708593 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.25483870967741934, + "acc_stderr,none": 0.024790118459332256 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.29064039408866993, + "acc_stderr,none": 0.031947400722655415 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.26296296296296295, + "acc_stderr,none": 0.026842057873833727 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.2847682119205298, + "acc_stderr,none": 0.03684881521389021 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4166666666666667, + "acc_stderr,none": 0.03362277436608037 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.24107142857142858, + "acc_stderr,none": 0.040598672469526885 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.26912120780515597, + "acc_stderr,none": 0.0037312548293410037, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.27481402763018065, + "acc_stderr,none": 0.006492958113495865, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.2706791116832958, + "acc_stderr,none": 0.007946546211766235, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.2430939226519337, + "acc_stderr,none": 0.007735863489554942, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.2844909609895338, + "acc_stderr,none": 0.007996769080081133, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q5_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 7, + 19, + 32, + 32, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756211195.0512605, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10342560.383471975, + "end_time": 10343631.313272536, + "total_evaluation_time_seconds": "1070.929800560698" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T13-25-06.984222.json b/results/gemma-3-1b-pt-q5_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T13-25-06.984222.json new file mode 100644 index 0000000000000000000000000000000000000000..559a12e1e60362d2cbf18c217e4f2ea087cb1f06 --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T13-25-06.984222.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7486398258977149, + "acc_stderr,none": 0.010121156016819219, + "acc_norm,none": 0.7464635473340587, + "acc_norm_stderr,none": 0.010150090834551817 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q5_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756214655.268706, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10346020.253014293, + "end_time": 10346130.420203028, + "total_evaluation_time_seconds": "110.16718873567879" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T13-48-42.483459.json b/results/gemma-3-1b-pt-q5_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T13-48-42.483459.json new file mode 100644 index 0000000000000000000000000000000000000000..77b38beb423f9e660429b390055786ff48e7b7f5 --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T13-48-42.483459.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.3484172982612572, + "exact_match_stderr,remove_whitespace": 0.003557026484971732 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "gemma-3-1b-pt-q5_k_m-dc-b10.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=gemma-3-1b-pt-q5_k_m-dc-b10.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756214808.5410202, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10346173.579368023, + "end_time": 10347545.92164026, + "total_evaluation_time_seconds": "1372.3422722369432" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-06-27.477647.json b/results/gemma-3-1b-pt-q5_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-06-27.477647.json new file mode 100644 index 0000000000000000000000000000000000000000..b75a1790a4f025f0e6eaddcb4ae0db88f53397e0 --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m/hellaswag-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-06-27.477647.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.46873132842063336, + "acc_stderr,none": 0.004980014536540145, + "acc_norm,none": 0.6190997809201354, + "acc_norm_stderr,none": 0.004846156699486519 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q5_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756180961.7683156, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10312326.432171715, + "end_time": 10312610.914023504, + "total_evaluation_time_seconds": "284.4818517882377" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-40-40.091612.json b/results/gemma-3-1b-pt-q5_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-40-40.091612.json new file mode 100644 index 0000000000000000000000000000000000000000..31beab3700067595ca39c84ffe739ddffb65f20d --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m/hellaswag-10/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-40-40.091612.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.47032463652658835, + "acc_stderr,none": 0.004980985384152799, + "acc_norm,none": 0.6263692491535551, + "acc_norm_stderr,none": 0.004827786289074885 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q5_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 17, + 17, + 19, + 19 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756181291.4446435, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10312655.172927069, + "end_time": 10314663.518843023, + "total_evaluation_time_seconds": "2008.34591595456" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-00-57.878146.json b/results/gemma-3-1b-pt-q5_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-00-57.878146.json new file mode 100644 index 0000000000000000000000000000000000000000..8f9c1943b667dbdb2a5989b04724928baa2516e6 --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m/mmlu-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-00-57.878146.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.2676969092721834, + "acc_stderr,none": 0.0037260275435773396, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.27481402763018065, + "acc_stderr,none": 0.006499157393703688, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.15079365079365079, + "acc_stderr,none": 0.03200686497287397 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.28484848484848485, + "acc_stderr,none": 0.03524390844511785 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.25980392156862747, + "acc_stderr,none": 0.030778554678693247 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.24050632911392406, + "acc_stderr,none": 0.02782078198114972 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.3305785123966942, + "acc_stderr,none": 0.042943408452120926 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.21296296296296297, + "acc_stderr,none": 0.039578354719809784 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.3006134969325153, + "acc_stderr,none": 0.036025113188067656 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.3179190751445087, + "acc_stderr,none": 0.025070713719153193 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.2558659217877095, + "acc_stderr,none": 0.014593620923210763 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.3054662379421222, + "acc_stderr,none": 0.026160584450140488 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.31790123456790126, + "acc_stderr,none": 0.02591006352824088 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.2711864406779661, + "acc_stderr,none": 0.01135458145162299 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.30409356725146197, + "acc_stderr,none": 0.03528211258245229 + }, + "mmlu_other": { + "acc,none": 0.2716446733183135, + "acc_stderr,none": 0.007946027036692364, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.23, + "acc_stderr,none": 0.04229525846816507 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.24528301886792453, + "acc_stderr,none": 0.026480357179895737 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.2658959537572254, + "acc_stderr,none": 0.033687629322594316 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.23, + "acc_stderr,none": 0.04229525846816507 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.19730941704035873, + "acc_stderr,none": 0.02670985334496796 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.21359223300970873, + "acc_stderr,none": 0.04058042015646035 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.26495726495726496, + "acc_stderr,none": 0.02891120880274955 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.2656449553001277, + "acc_stderr,none": 0.015794302487888694 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.2777777777777778, + "acc_stderr,none": 0.025646863097137932 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.2624113475177305, + "acc_stderr,none": 0.02624492034984306 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4117647058823529, + "acc_stderr,none": 0.02989616303312549 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.29518072289156627, + "acc_stderr,none": 0.035509201856896336 + }, + "mmlu_social_sciences": { + "acc,none": 0.24374390640233995, + "acc_stderr,none": 0.00773655532154967, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.2543859649122807, + "acc_stderr,none": 0.04096985139843672 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.26262626262626265, + "acc_stderr,none": 0.031353050095330834 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.21761658031088082, + "acc_stderr,none": 0.029778663037752975 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.2153846153846154, + "acc_stderr,none": 0.02084303455746285 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.22268907563025211, + "acc_stderr,none": 0.027025433498882392 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.21467889908256882, + "acc_stderr,none": 0.01760430414925655 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.24427480916030533, + "acc_stderr,none": 0.037683359597287434 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.2777777777777778, + "acc_stderr,none": 0.018120224251484508 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.2727272727272727, + "acc_stderr,none": 0.04265792110940591 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.2530612244897959, + "acc_stderr,none": 0.027833023871399704 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.22885572139303484, + "acc_stderr,none": 0.029705284056772495 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_stem": { + "acc,none": 0.27656200444021567, + "acc_stderr,none": 0.00793671883968685, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.34, + "acc_stderr,none": 0.04760952285695233 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.2962962962962963, + "acc_stderr,none": 0.03944624162501111 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.3157894736842105, + "acc_stderr,none": 0.03782728980865462 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.2777777777777778, + "acc_stderr,none": 0.037455547914624555 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.2, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.25, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.22549019607843138, + "acc_stderr,none": 0.04158307533083289 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.17446808510638298, + "acc_stderr,none": 0.024809442335503955 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.2689655172413793, + "acc_stderr,none": 0.03695183311650232 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.25925925925925924, + "acc_stderr,none": 0.022569897074918445 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.2645161290322581, + "acc_stderr,none": 0.02509189237885932 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.2955665024630542, + "acc_stderr,none": 0.03210494433751457 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.26296296296296295, + "acc_stderr,none": 0.026842057873833727 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.271523178807947, + "acc_stderr,none": 0.03631329803969657 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.39351851851851855, + "acc_stderr,none": 0.033317478763703126 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.23214285714285715, + "acc_stderr,none": 0.04007341809755807 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.2676969092721834, + "acc_stderr,none": 0.0037260275435773396, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.27481402763018065, + "acc_stderr,none": 0.006499157393703688, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.2716446733183135, + "acc_stderr,none": 0.007946027036692364, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.24374390640233995, + "acc_stderr,none": 0.00773655532154967, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.27656200444021567, + "acc_stderr,none": 0.00793671883968685, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q5_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 7, + 19, + 32, + 32, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756179896.6109464, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10311261.27191683, + "end_time": 10312281.313350204, + "total_evaluation_time_seconds": "1020.0414333734661" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-43-18.778488.json b/results/gemma-3-1b-pt-q5_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-43-18.778488.json new file mode 100644 index 0000000000000000000000000000000000000000..87195eeb34134a58d2274b615e1a435d846ddec4 --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m/piqa-0/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T04-43-18.778488.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.749727965179543, + "acc_stderr,none": 0.01010656188008975, + "acc_norm,none": 0.7453754080522307, + "acc_norm_stderr,none": 0.010164432237060617 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q5_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756183342.0299957, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10314707.323286947, + "end_time": 10314822.215488749, + "total_evaluation_time_seconds": "114.89220180176198" +} \ No newline at end of file diff --git a/results/gemma-3-1b-pt-q5_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T05-08-03.555978.json b/results/gemma-3-1b-pt-q5_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T05-08-03.555978.json new file mode 100644 index 0000000000000000000000000000000000000000..296121fff7c23bae91caa073a60a0fb3f8af1e71 --- /dev/null +++ b/results/gemma-3-1b-pt-q5_k_m/triviaqa-5/skymizer__gemma-3-1b-pt-GGUF/results_2025-08-26T05-08-03.555978.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.35298707088720466, + "exact_match_stderr,remove_whitespace": 0.003567700179654136 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "skymizer/gemma-3-1b-pt-GGUF", + "gguf_file": "gemma-3-1b-pt-q5_k_m.gguf", + "tokenizer": "google/gemma-3-1b-pt" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/gemma-3-1b-pt-GGUF,gguf_file=gemma-3-1b-pt-q5_k_m.gguf,tokenizer=google/gemma-3-1b-pt", + "model_num_parameters": 999885952, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "debe2478e8ef0525db3391d4b90bddbea8b20670", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756183500.2287133, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "1" + ], + "tokenizer_bos_token": [ + "", + "2" + ], + "eot_token_id": 1, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/gemma-3-1b-pt-GGUF", + "model_name_sanitized": "skymizer__gemma-3-1b-pt-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10314865.389951872, + "end_time": 10316306.994069807, + "total_evaluation_time_seconds": "1441.6041179355234" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T04-51-36.078167.json b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T04-51-36.078167.json new file mode 100644 index 0000000000000000000000000000000000000000..864bd9f152221ddc1820bbf7472ce8bd310a1eac --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T04-51-36.078167.json @@ -0,0 +1,133 @@ +{ + "results": { + "gpqa_main_zeroshot": { + "alias": "gpqa_main_zeroshot", + "acc,none": 0.27232142857142855, + "acc_stderr,none": 0.02105508212932411, + "acc_norm,none": 0.27232142857142855, + "acc_norm_stderr,none": 0.02105508212932411 + } + }, + "group_subtasks": { + "gpqa_main_zeroshot": [] + }, + "configs": { + "gpqa_main_zeroshot": { + "task": "gpqa_main_zeroshot", + "tag": "gpqa", + "dataset_path": "Idavidrein/gpqa", + "dataset_name": "gpqa_main", + "training_split": "train", + "validation_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "(A)", + "(B)", + "(C)", + "(D)" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "gpqa_main_zeroshot": 1.0 + }, + "n-shot": { + "gpqa_main_zeroshot": 0 + }, + "higher_is_better": { + "gpqa_main_zeroshot": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "gpqa_main_zeroshot": { + "original": 448, + "effective": 448 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 9, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756442725.230191, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6788353.722110151, + "end_time": 6788886.198874184, + "total_evaluation_time_seconds": "532.4767640326172" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-28T23-45-08.122514.json b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-28T23-45-08.122514.json new file mode 100644 index 0000000000000000000000000000000000000000..55757a19d5a137254874721d29346adf6a21ea9b --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-28T23-45-08.122514.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.5750846444931289, + "acc_stderr,none": 0.00493319877670009, + "acc_norm,none": 0.734017128062139, + "acc_norm_stderr,none": 0.004409521343139737 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756423105.123428, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6768754.425889828, + "end_time": 6770498.242228656, + "total_evaluation_time_seconds": "1743.8163388278335" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T03-38-59.589582.json b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T03-38-59.589582.json new file mode 100644 index 0000000000000000000000000000000000000000..eff46f7ce31734aa72afe8f7db926a6f73c10627 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T03-38-59.589582.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.5942043417645887, + "acc_stderr,none": 0.004900417982582057, + "acc_norm,none": 0.7797251543517227, + "acc_norm_stderr,none": 0.004135849642817268 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 19, + 19, + 22, + 22 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756424948.985949, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6770604.923830719, + "end_time": 6784529.707175097, + "total_evaluation_time_seconds": "13924.783344378695" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T13-45-33.534718.json b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T13-45-33.534718.json new file mode 100644 index 0000000000000000000000000000000000000000..56ff680670d8265c876b898fa43c73590163d765 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T13-45-33.534718.json @@ -0,0 +1,141 @@ +{ + "results": { + "ifeval": { + "alias": "ifeval", + "prompt_level_strict_acc,none": 0.6987060998151571, + "prompt_level_strict_acc_stderr,none": 0.019744473483514356, + "inst_level_strict_acc,none": 0.7817745803357314, + "inst_level_strict_acc_stderr,none": "N/A", + "prompt_level_loose_acc,none": 0.7412199630314233, + "prompt_level_loose_acc_stderr,none": 0.018846992560712525, + "inst_level_loose_acc,none": 0.8141486810551559, + "inst_level_loose_acc_stderr,none": "N/A" + } + }, + "group_subtasks": { + "ifeval": [] + }, + "configs": { + "ifeval": { + "task": "ifeval", + "dataset_path": "google/IFEval", + "test_split": "train", + "doc_to_text": "prompt", + "doc_to_target": 0, + "unsafe_code": false, + "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "prompt_level_strict_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_strict_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + }, + { + "metric": "prompt_level_loose_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_loose_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [], + "do_sample": false, + "temperature": 0.0, + "max_gen_toks": 1280 + }, + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 4.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "ifeval": 4.0 + }, + "n-shot": { + "ifeval": 0 + }, + "higher_is_better": { + "ifeval": { + "prompt_level_strict_acc": true, + "inst_level_strict_acc": true, + "prompt_level_loose_acc": true, + "inst_level_loose_acc": true + } + }, + "n-samples": { + "ifeval": { + "original": 541, + "effective": 541 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756471974.2640414, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6817664.029671304, + "end_time": 6820923.655691498, + "total_evaluation_time_seconds": "3259.6260201940313" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-28T23-14-12.467699.json b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-28T23-14-12.467699.json new file mode 100644 index 0000000000000000000000000000000000000000..45f9dfe3c67e87a324ccd186987671d5c3d92592 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-28T23-14-12.467699.json @@ -0,0 +1,3516 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.6650049850448654, + "acc_stderr,none": 0.003808060763406726, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6276301806588735, + "acc_stderr,none": 0.006799477656917312, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5317460317460317, + "acc_stderr,none": 0.04463112720677173 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.7515151515151515, + "acc_stderr,none": 0.03374402644139407 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.7843137254901961, + "acc_stderr,none": 0.028867431449849337 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8227848101265823, + "acc_stderr,none": 0.024856364184503203 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8099173553719008, + "acc_stderr,none": 0.03581796951709283 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.7685185185185185, + "acc_stderr,none": 0.040774947092526284 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.8098159509202454, + "acc_stderr,none": 0.03083349114628123 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7341040462427746, + "acc_stderr,none": 0.02378620325550823 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.5396648044692738, + "acc_stderr,none": 0.016669799592112056 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7266881028938906, + "acc_stderr,none": 0.025311765975426094 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.7283950617283951, + "acc_stderr,none": 0.024748624490537455 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.4941329856584094, + "acc_stderr,none": 0.012769356925216389 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8011695906432749, + "acc_stderr,none": 0.030611116557432514 + }, + "mmlu_other": { + "acc,none": 0.722240102993241, + "acc_stderr,none": 0.007734304689425906, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.75, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.7320754716981132, + "acc_stderr,none": 0.027257260322494828 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7052023121387283, + "acc_stderr,none": 0.03476599607516476 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.672645739910314, + "acc_stderr,none": 0.0314938467099413 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.7961165048543689, + "acc_stderr,none": 0.0398913985953177 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.8632478632478633, + "acc_stderr,none": 0.022509033937077844 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.78, + "acc_stderr,none": 0.041633319989322654 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.8314176245210728, + "acc_stderr,none": 0.013387895731543545 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7483660130718954, + "acc_stderr,none": 0.024848018263875137 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.5212765957446809, + "acc_stderr,none": 0.02980048164562865 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7169117647058824, + "acc_stderr,none": 0.027365861131513815 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.4939759036144578, + "acc_stderr,none": 0.03892212195333041 + }, + "mmlu_social_sciences": { + "acc,none": 0.7556061098472538, + "acc_stderr,none": 0.007632947978764084, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.5789473684210527, + "acc_stderr,none": 0.04644602091222323 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8232323232323232, + "acc_stderr,none": 0.027178752639044908 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.8497409326424871, + "acc_stderr,none": 0.025787723180723855 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.6666666666666666, + "acc_stderr,none": 0.023901157979402492 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.7394957983193278, + "acc_stderr,none": 0.028510251512341933 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.8495412844036697, + "acc_stderr,none": 0.015328563932669235 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.7938931297709924, + "acc_stderr,none": 0.035477710041594654 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.6977124183006536, + "acc_stderr,none": 0.018579232711113967 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6818181818181818, + "acc_stderr,none": 0.044612721759105065 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.726530612244898, + "acc_stderr,none": 0.028535560337128486 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8208955223880597, + "acc_stderr,none": 0.027113286753111865 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.84, + "acc_stderr,none": 0.03684529491774706 + }, + "mmlu_stem": { + "acc,none": 0.5759594037424675, + "acc_stderr,none": 0.008432144161387883, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.32, + "acc_stderr,none": 0.04688261722621507 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.6666666666666666, + "acc_stderr,none": 0.04072314811876841 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.7368421052631579, + "acc_stderr,none": 0.03583496176361067 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.7777777777777778, + "acc_stderr,none": 0.03476590104304133 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.45, + "acc_stderr,none": 0.05 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.52, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.4411764705882353, + "acc_stderr,none": 0.049406356306056644 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.8, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.6340425531914894, + "acc_stderr,none": 0.03148955829745524 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6551724137931034, + "acc_stderr,none": 0.03960933549451213 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.4708994708994709, + "acc_stderr,none": 0.02570765861415493 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.8032258064516129, + "acc_stderr,none": 0.02261640942074208 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.5665024630541872, + "acc_stderr,none": 0.034867317274198714 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.74, + "acc_stderr,none": 0.0440844002276808 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.3814814814814815, + "acc_stderr,none": 0.029616718927497565 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.48344370860927155, + "acc_stderr,none": 0.0408024418562897 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.5555555555555556, + "acc_stderr,none": 0.03388857118502326 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.49107142857142855, + "acc_stderr,none": 0.04745033255489126 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.6650049850448654, + "acc_stderr,none": 0.003808060763406726, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6276301806588735, + "acc_stderr,none": 0.006799477656917312, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.722240102993241, + "acc_stderr,none": 0.007734304689425906, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.7556061098472538, + "acc_stderr,none": 0.007632947978764084, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.5759594037424675, + "acc_stderr,none": 0.008432144161387883, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_moral_disputes", + "mmlu_formal_logic", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_high_school_european_history", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios" + ], + "mmlu_social_sciences": [ + "mmlu_econometrics", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology" + ], + "mmlu_other": [ + "mmlu_miscellaneous", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_global_facts", + "mmlu_management", + "mmlu_human_aging", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics", + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition" + ], + "mmlu_stem": [ + "mmlu_high_school_biology", + "mmlu_machine_learning", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_high_school_statistics", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_college_physics", + "mmlu_high_school_computer_science", + "mmlu_college_mathematics", + "mmlu_high_school_chemistry", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": 1, + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756418342.3080492, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6764012.428708822, + "end_time": 6768642.58793581, + "total_evaluation_time_seconds": "4630.159226988442" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T03-45-45.219594.json b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T03-45-45.219594.json new file mode 100644 index 0000000000000000000000000000000000000000..aed33b87d9948e2b67b0d7c08bac5f658854c664 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T03-45-45.219594.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.795429815016322, + "acc_stderr,none": 0.009411688039193577, + "acc_norm,none": 0.794885745375408, + "acc_norm_stderr,none": 0.009420971671018023 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756438945.0546112, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6784623.486849175, + "end_time": 6784935.339727577, + "total_evaluation_time_seconds": "311.85287840198725" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T04-41-01.642818.json b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T04-41-01.642818.json new file mode 100644 index 0000000000000000000000000000000000000000..e1e51a45e8e2619fbd4e2ff81288bd89bf7a7ee2 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T04-41-01.642818.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.5720575122603656, + "exact_match_stderr,remove_whitespace": 0.0036937289351404315 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.1-8b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756439418.136281, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6785063.112292615, + "end_time": 6788251.762945544, + "total_evaluation_time_seconds": "3188.650652929209" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-19-24.887264.json b/results/llama-3.1-8b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-19-24.887264.json new file mode 100644 index 0000000000000000000000000000000000000000..b41c6e3034d76b28f6ce77bb3a0d2537c3fa8221 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-19-24.887264.json @@ -0,0 +1,133 @@ +{ + "results": { + "gpqa_main_zeroshot": { + "alias": "gpqa_main_zeroshot", + "acc,none": 0.28125, + "acc_stderr,none": 0.021265785688273954, + "acc_norm,none": 0.28125, + "acc_norm_stderr,none": 0.021265785688273954 + } + }, + "group_subtasks": { + "gpqa_main_zeroshot": [] + }, + "configs": { + "gpqa_main_zeroshot": { + "task": "gpqa_main_zeroshot", + "tag": "gpqa", + "dataset_path": "Idavidrein/gpqa", + "dataset_name": "gpqa_main", + "training_split": "train", + "validation_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "(A)", + "(B)", + "(C)", + "(D)" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "gpqa_main_zeroshot": 1.0 + }, + "n-shot": { + "gpqa_main_zeroshot": 0 + }, + "higher_is_better": { + "gpqa_main_zeroshot": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "gpqa_main_zeroshot": { + "original": 448, + "effective": 448 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f", + "batch_size": "auto:4", + "batch_sizes": [ + 9, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756397596.8580039, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6743314.347175269, + "end_time": 6743755.008235267, + "total_evaluation_time_seconds": "440.6610599979758" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T11-17-57.196185.json b/results/llama-3.1-8b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T11-17-57.196185.json new file mode 100644 index 0000000000000000000000000000000000000000..b0e6e02846054b001d16b0e49423a31a77b0981c --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T11-17-57.196185.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.5762796255725952, + "acc_stderr,none": 0.0049313726571298755, + "acc_norm,none": 0.7341167098187612, + "acc_norm_stderr,none": 0.00440899486864994 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756378309.7240536, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6724033.028550806, + "end_time": 6725667.316656574, + "total_evaluation_time_seconds": "1634.2881057672203" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-11-18.604003.json b/results/llama-3.1-8b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-11-18.604003.json new file mode 100644 index 0000000000000000000000000000000000000000..af91563641587a5f634cffc0dd16456d35411afc --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-11-18.604003.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.5954989046006771, + "acc_stderr,none": 0.004897921845492068, + "acc_norm,none": 0.780920135431189, + "acc_norm_stderr,none": 0.004127775403148651 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f", + "batch_size": "auto:4", + "batch_sizes": [ + 19, + 19, + 22, + 22 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756379994.0238812, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6725716.583365066, + "end_time": 6739668.724583514, + "total_evaluation_time_seconds": "13952.141218448058" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m/ifeval/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-29T11-43-19.960215.json b/results/llama-3.1-8b-instruct-q3_k_m/ifeval/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-29T11-43-19.960215.json new file mode 100644 index 0000000000000000000000000000000000000000..49eebecbd4699096193aa1441edd5026bdd0b7e4 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m/ifeval/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-29T11-43-19.960215.json @@ -0,0 +1,141 @@ +{ + "results": { + "ifeval": { + "alias": "ifeval", + "prompt_level_strict_acc,none": 0.711645101663586, + "prompt_level_strict_acc_stderr,none": 0.019493890350654804, + "inst_level_strict_acc,none": 0.790167865707434, + "inst_level_strict_acc_stderr,none": "N/A", + "prompt_level_loose_acc,none": 0.7597042513863216, + "prompt_level_loose_acc_stderr,none": 0.018386473581487088, + "inst_level_loose_acc,none": 0.8237410071942446, + "inst_level_loose_acc_stderr,none": "N/A" + } + }, + "group_subtasks": { + "ifeval": [] + }, + "configs": { + "ifeval": { + "task": "ifeval", + "dataset_path": "google/IFEval", + "test_split": "train", + "doc_to_text": "prompt", + "doc_to_target": 0, + "unsafe_code": false, + "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "prompt_level_strict_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_strict_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + }, + { + "metric": "prompt_level_loose_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_loose_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [], + "do_sample": false, + "temperature": 0.0, + "max_gen_toks": 1280 + }, + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 4.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "ifeval": 4.0 + }, + "n-shot": { + "ifeval": 0 + }, + "higher_is_better": { + "ifeval": { + "prompt_level_strict_acc": true, + "inst_level_strict_acc": true, + "prompt_level_loose_acc": true, + "inst_level_loose_acc": true + } + }, + "n-samples": { + "ifeval": { + "original": 541, + "effective": 541 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756464684.2343795, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6810343.246271214, + "end_time": 6813590.080971116, + "total_evaluation_time_seconds": "3246.834699901752" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T10-49-52.307915.json b/results/llama-3.1-8b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T10-49-52.307915.json new file mode 100644 index 0000000000000000000000000000000000000000..3d013d5e1570200db576cf9562bb7e69a3137558 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T10-49-52.307915.json @@ -0,0 +1,3516 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.6623700327588663, + "acc_stderr,none": 0.0038108137883870693, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6259298618490967, + "acc_stderr,none": 0.006790849423879243, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5, + "acc_stderr,none": 0.04472135954999579 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.7575757575757576, + "acc_stderr,none": 0.03346409881055956 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.7892156862745098, + "acc_stderr,none": 0.028626547912437423 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8185654008438819, + "acc_stderr,none": 0.025085961144579647 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8016528925619835, + "acc_stderr,none": 0.03640118271990949 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.7870370370370371, + "acc_stderr,none": 0.03957835471980979 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.803680981595092, + "acc_stderr,none": 0.031207970394709197 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7485549132947977, + "acc_stderr,none": 0.023357365785874006 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.5307262569832403, + "acc_stderr,none": 0.016690896161944517 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7202572347266881, + "acc_stderr,none": 0.02549425935069486 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.7253086419753086, + "acc_stderr,none": 0.024836057868294684 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.49282920469361147, + "acc_stderr,none": 0.012768922739553434 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8187134502923976, + "acc_stderr,none": 0.02954774168764006 + }, + "mmlu_other": { + "acc,none": 0.7177341486964918, + "acc_stderr,none": 0.00777083586917588, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.75, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.7320754716981132, + "acc_stderr,none": 0.027257260322494828 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.6705202312138728, + "acc_stderr,none": 0.03583901754736415 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.6771300448430493, + "acc_stderr,none": 0.031381476375754995 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.7864077669902912, + "acc_stderr,none": 0.04058042015646035 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.8504273504273504, + "acc_stderr,none": 0.023365051491753757 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.8, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.8288633461047255, + "acc_stderr,none": 0.013468201614066377 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7418300653594772, + "acc_stderr,none": 0.025058503316958167 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.5070921985815603, + "acc_stderr,none": 0.029824498559128977 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7205882352941176, + "acc_stderr,none": 0.02725720260611497 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.4939759036144578, + "acc_stderr,none": 0.03892212195333041 + }, + "mmlu_social_sciences": { + "acc,none": 0.7543061423464413, + "acc_stderr,none": 0.007629481064864609, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.5789473684210527, + "acc_stderr,none": 0.04644602091222323 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8232323232323232, + "acc_stderr,none": 0.027178752639044908 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.8756476683937824, + "acc_stderr,none": 0.02381447708659357 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.6615384615384615, + "acc_stderr,none": 0.023991500500313064 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.726890756302521, + "acc_stderr,none": 0.0289420040409982 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.8532110091743119, + "acc_stderr,none": 0.015173141845126333 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.7862595419847328, + "acc_stderr,none": 0.03595461611774691 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.6911764705882353, + "acc_stderr,none": 0.01869085027359537 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6636363636363637, + "acc_stderr,none": 0.04525393596302509 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7346938775510204, + "acc_stderr,none": 0.028263889943784655 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8109452736318408, + "acc_stderr,none": 0.02768691358801299 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.85, + "acc_stderr,none": 0.03588702812826367 + }, + "mmlu_stem": { + "acc,none": 0.5724706628607675, + "acc_stderr,none": 0.008442708678734078, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.674074074074074, + "acc_stderr,none": 0.040491220417025006 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.7368421052631579, + "acc_stderr,none": 0.03583496176361067 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.7847222222222222, + "acc_stderr,none": 0.034370793441061386 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.5, + "acc_stderr,none": 0.050251890762960605 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.52, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.35, + "acc_stderr,none": 0.04793724854411023 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.4215686274509804, + "acc_stderr,none": 0.04913595201274502 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.82, + "acc_stderr,none": 0.03861229196653691 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.6170212765957447, + "acc_stderr,none": 0.03177821250236923 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6344827586206897, + "acc_stderr,none": 0.04013124195424389 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.46825396825396826, + "acc_stderr,none": 0.02569935283213174 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.7935483870967742, + "acc_stderr,none": 0.023025899617188667 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.5763546798029556, + "acc_stderr,none": 0.03476725747649036 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.74, + "acc_stderr,none": 0.0440844002276808 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.3888888888888889, + "acc_stderr,none": 0.0297232789614767 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.4900662251655629, + "acc_stderr,none": 0.040816771072484305 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.5231481481481481, + "acc_stderr,none": 0.034063153607115024 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.45535714285714285, + "acc_stderr,none": 0.04726835553719097 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.6623700327588663, + "acc_stderr,none": 0.0038108137883870693, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6259298618490967, + "acc_stderr,none": 0.006790849423879243, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7177341486964918, + "acc_stderr,none": 0.00777083586917588, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.7543061423464413, + "acc_stderr,none": 0.007629481064864609, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.5724706628607675, + "acc_stderr,none": 0.008442708678734078, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_moral_disputes", + "mmlu_formal_logic", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_high_school_european_history", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios" + ], + "mmlu_social_sciences": [ + "mmlu_econometrics", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology" + ], + "mmlu_other": [ + "mmlu_miscellaneous", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_global_facts", + "mmlu_management", + "mmlu_human_aging", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics", + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition" + ], + "mmlu_stem": [ + "mmlu_high_school_biology", + "mmlu_machine_learning", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_high_school_statistics", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_college_physics", + "mmlu_high_school_computer_science", + "mmlu_college_mathematics", + "mmlu_high_school_chemistry", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f", + "batch_size": 1, + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756373734.2537487, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6719448.803223402, + "end_time": 6723982.428276844, + "total_evaluation_time_seconds": "4533.625053442083" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-17-14.136330.json b/results/llama-3.1-8b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-17-14.136330.json new file mode 100644 index 0000000000000000000000000000000000000000..a987684624aa06c99bb2d8968c7344d04f48ed49 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T15-17-14.136330.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7976060935799782, + "acc_stderr,none": 0.009374289682807648, + "acc_norm,none": 0.794885745375408, + "acc_norm_stderr,none": 0.009420971671018023 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756394025.9041305, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6739723.437865958, + "end_time": 6740024.257117974, + "total_evaluation_time_seconds": "300.8192520160228" +} \ No newline at end of file diff --git a/results/llama-3.1-8b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-11-09.665476.json b/results/llama-3.1-8b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-11-09.665476.json new file mode 100644 index 0000000000000000000000000000000000000000..6e6b098bb702c789f290c3a47947dc81cba154c3 --- /dev/null +++ b/results/llama-3.1-8b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.1-8B-Instruct-GGUF/results_2025-08-28T16-11-09.665476.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.5716116807846634, + "exact_match_stderr,remove_whitespace": 0.0036942121228731735 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "gguf_file": "llama-3.1-8b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Meta-Llama-3.1-8B-Instruct" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.1-8B-Instruct-GGUF,gguf_file=llama-3.1-8b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_num_parameters": 8030261248, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "73c4e4d5ac2f0b4554477740ce9621999127f12f", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756394437.5566554, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.1-8B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.1-8B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "e10ca381b1ccc5cf9db52e371f3b6651576caee0a630b452e2816b2d404d4b65", + "start_time": 6740125.041809197, + "end_time": 6743259.785275262, + "total_evaluation_time_seconds": "3134.7434660652652" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T10-43-39.403807.json b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T10-43-39.403807.json new file mode 100644 index 0000000000000000000000000000000000000000..3c33791e9a3db5d75dab29f0620a40eb7d99263d --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T10-43-39.403807.json @@ -0,0 +1,133 @@ +{ + "results": { + "gpqa_main_zeroshot": { + "alias": "gpqa_main_zeroshot", + "acc,none": 0.28348214285714285, + "acc_stderr,none": 0.0213168289872622, + "acc_norm,none": 0.28348214285714285, + "acc_norm_stderr,none": 0.0213168289872622 + } + }, + "group_subtasks": { + "gpqa_main_zeroshot": [] + }, + "configs": { + "gpqa_main_zeroshot": { + "task": "gpqa_main_zeroshot", + "tag": "gpqa", + "dataset_path": "Idavidrein/gpqa", + "dataset_name": "gpqa_main", + "training_split": "train", + "validation_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "(A)", + "(B)", + "(C)", + "(D)" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "gpqa_main_zeroshot": 1.0 + }, + "n-shot": { + "gpqa_main_zeroshot": 0 + }, + "higher_is_better": { + "gpqa_main_zeroshot": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "gpqa_main_zeroshot": { + "original": 448, + "effective": 448 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 13, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756464097.7877123, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6809802.016860311, + "end_time": 6810009.524530667, + "total_evaluation_time_seconds": "207.50767035596073" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-29T09-23-04.950976.json b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-29T09-23-04.950976.json new file mode 100644 index 0000000000000000000000000000000000000000..44522c73a4e039a0c4c103afab506cb9e1058380 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-29T09-23-04.950976.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4231228838876718, + "acc_stderr,none": 0.004930448527146583, + "acc_norm,none": 0.5246962756423024, + "acc_norm_stderr,none": 0.004983691099110917 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756459040.171013, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6804763.703988922, + "end_time": 6805175.0714428, + "total_evaluation_time_seconds": "411.3674538778141" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T10-13-20.039729.json b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T10-13-20.039729.json new file mode 100644 index 0000000000000000000000000000000000000000..cb1a3d3551c8efc3b7dd2f962ddb1d62d5f4d309 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T10-13-20.039729.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4374626568412667, + "acc_stderr,none": 0.004950598300667601, + "acc_norm,none": 0.576777534355706, + "acc_norm_stderr,none": 0.004930603061590628 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756459592.3099344, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6805284.145371093, + "end_time": 6808190.157087202, + "total_evaluation_time_seconds": "2906.011716108769" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T14-53-30.492986.json b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T14-53-30.492986.json new file mode 100644 index 0000000000000000000000000000000000000000..3e9236d210828cd2109af4f1b72084b8212dac48 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T14-53-30.492986.json @@ -0,0 +1,141 @@ +{ + "results": { + "ifeval": { + "alias": "ifeval", + "prompt_level_strict_acc,none": 0.4232902033271719, + "prompt_level_strict_acc_stderr,none": 0.021261842325248494, + "inst_level_strict_acc,none": 0.5599520383693045, + "inst_level_strict_acc_stderr,none": "N/A", + "prompt_level_loose_acc,none": 0.46210720887245843, + "prompt_level_loose_acc_stderr,none": 0.021454695436204742, + "inst_level_loose_acc,none": 0.592326139088729, + "inst_level_loose_acc_stderr,none": "N/A" + } + }, + "group_subtasks": { + "ifeval": [] + }, + "configs": { + "ifeval": { + "task": "ifeval", + "dataset_path": "google/IFEval", + "test_split": "train", + "doc_to_text": "prompt", + "doc_to_target": 0, + "unsafe_code": false, + "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "prompt_level_strict_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_strict_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + }, + { + "metric": "prompt_level_loose_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_loose_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [], + "do_sample": false, + "temperature": 0.0, + "max_gen_toks": 1280 + }, + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 4.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "ifeval": 4.0 + }, + "n-shot": { + "ifeval": 0 + }, + "higher_is_better": { + "ifeval": { + "prompt_level_strict_acc": true, + "inst_level_strict_acc": true, + "prompt_level_loose_acc": true, + "inst_level_loose_acc": true + } + }, + "n-samples": { + "ifeval": { + "original": 541, + "effective": 541 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756477906.445423, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6823613.272541048, + "end_time": 6825000.613895395, + "total_evaluation_time_seconds": "1387.3413543468341" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-29T09-15-25.269759.json b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-29T09-15-25.269759.json new file mode 100644 index 0000000000000000000000000000000000000000..72d8877d309e7dc79b833231d6991757abef6151 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-29T09-15-25.269759.json @@ -0,0 +1,3516 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.41069648198262354, + "acc_stderr,none": 0.004062788136992837, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.38235919234856536, + "acc_stderr,none": 0.006950737639650346, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.2698412698412698, + "acc_stderr,none": 0.0397015827323517 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.5272727272727272, + "acc_stderr,none": 0.03898531605579422 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.5098039215686274, + "acc_stderr,none": 0.035086373586305744 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.4641350210970464, + "acc_stderr,none": 0.032463388980556646 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.5289256198347108, + "acc_stderr,none": 0.04556710331269495 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.5, + "acc_stderr,none": 0.04833682445228318 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.38650306748466257, + "acc_stderr,none": 0.03825825548848611 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.38439306358381503, + "acc_stderr,none": 0.026189666966272097 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.28268156424581004, + "acc_stderr,none": 0.015060381730018045 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.4983922829581994, + "acc_stderr,none": 0.02839794490780658 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.4876543209876543, + "acc_stderr,none": 0.027812262269327277 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.3155149934810952, + "acc_stderr,none": 0.01186918484305873 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.5847953216374269, + "acc_stderr,none": 0.03779275945503204 + }, + "mmlu_other": { + "acc,none": 0.46829739298358547, + "acc_stderr,none": 0.008820158090262145, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.38, + "acc_stderr,none": 0.04878317312145634 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.4830188679245283, + "acc_stderr,none": 0.03075512036411982 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.41040462427745666, + "acc_stderr,none": 0.03750757044895538 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.36, + "acc_stderr,none": 0.048241815132442176 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.4349775784753363, + "acc_stderr,none": 0.03327283370271337 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.5631067961165048, + "acc_stderr,none": 0.049111471073657785 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.5555555555555556, + "acc_stderr,none": 0.03255326307272487 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.38, + "acc_stderr,none": 0.04878317312145634 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.5734355044699873, + "acc_stderr,none": 0.01768606697567551 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.5, + "acc_stderr,none": 0.028629916715693413 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.3120567375886525, + "acc_stderr,none": 0.027640120545169986 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.41544117647058826, + "acc_stderr,none": 0.02993534270787772 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.3373493975903614, + "acc_stderr,none": 0.03680783690727582 + }, + "mmlu_social_sciences": { + "acc,none": 0.45791355216119595, + "acc_stderr,none": 0.008844465537366992, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.3157894736842105, + "acc_stderr,none": 0.04372748290278002 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.5353535353535354, + "acc_stderr,none": 0.035534363688280626 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.5181347150259067, + "acc_stderr,none": 0.03606065001832916 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.4025641025641026, + "acc_stderr,none": 0.02486499515976774 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.42436974789915966, + "acc_stderr,none": 0.03210479051015769 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.5889908256880734, + "acc_stderr,none": 0.021095050687277673 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.4732824427480916, + "acc_stderr,none": 0.04379024936553895 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.3431372549019608, + "acc_stderr,none": 0.01920660684882543 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.37272727272727274, + "acc_stderr,none": 0.0463138131942546 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.45714285714285713, + "acc_stderr,none": 0.03189141832421396 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.5323383084577115, + "acc_stderr,none": 0.03528131472933611 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.56, + "acc_stderr,none": 0.049888765156985884 + }, + "mmlu_stem": { + "acc,none": 0.35014272121788775, + "acc_stderr,none": 0.008352174426071701, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.5185185185185185, + "acc_stderr,none": 0.043163785995113245 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.5197368421052632, + "acc_stderr,none": 0.040657710025626057 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.4513888888888889, + "acc_stderr,none": 0.04161402398403282 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.26, + "acc_stderr,none": 0.0440844002276808 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.2549019607843137, + "acc_stderr,none": 0.0433643270799318 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.42, + "acc_stderr,none": 0.04960449637488583 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.3872340425531915, + "acc_stderr,none": 0.031843892653395246 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.42758620689655175, + "acc_stderr,none": 0.041227371113703344 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.26455026455026454, + "acc_stderr,none": 0.022717467897708593 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.45161290322580644, + "acc_stderr,none": 0.02831050034856844 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.4236453201970443, + "acc_stderr,none": 0.034767257476490364 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.34, + "acc_stderr,none": 0.04760952285695233 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.23333333333333334, + "acc_stderr,none": 0.02578787422095932 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.23841059602649006, + "acc_stderr,none": 0.03479185572599655 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.30092592592592593, + "acc_stderr,none": 0.03128039084329886 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.25892857142857145, + "acc_stderr,none": 0.04157751539865629 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.41069648198262354, + "acc_stderr,none": 0.004062788136992837, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.38235919234856536, + "acc_stderr,none": 0.006950737639650346, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.46829739298358547, + "acc_stderr,none": 0.008820158090262145, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.45791355216119595, + "acc_stderr,none": 0.008844465537366992, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.35014272121788775, + "acc_stderr,none": 0.008352174426071701, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_moral_disputes", + "mmlu_formal_logic", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_high_school_european_history", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios" + ], + "mmlu_social_sciences": [ + "mmlu_econometrics", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology" + ], + "mmlu_other": [ + "mmlu_miscellaneous", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_global_facts", + "mmlu_management", + "mmlu_human_aging", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics", + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition" + ], + "mmlu_stem": [ + "mmlu_high_school_biology", + "mmlu_machine_learning", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_high_school_statistics", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_college_physics", + "mmlu_high_school_computer_science", + "mmlu_college_mathematics", + "mmlu_high_school_chemistry", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": 1, + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756457782.6015804, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6803426.485551902, + "end_time": 6804715.390105791, + "total_evaluation_time_seconds": "1288.904553889297" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T10-17-22.800022.json b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T10-17-22.800022.json new file mode 100644 index 0000000000000000000000000000000000000000..ad5cf523e21554c6ce5753b6801528d79a273bb0 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T10-17-22.800022.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.6936887921653971, + "acc_stderr,none": 0.010754970032367363, + "acc_norm,none": 0.6996735582154516, + "acc_norm_stderr,none": 0.010695225308183266 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756462580.8541045, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6808297.029727637, + "end_time": 6808432.920860001, + "total_evaluation_time_seconds": "135.89113236404955" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T10-38-52.919418.json b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T10-38-52.919418.json new file mode 100644 index 0000000000000000000000000000000000000000..6b9b462ee4b89fc4a48502dea5df0c64f1fe66b1 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T10-38-52.919418.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.19215336602764155, + "exact_match_stderr,remove_whitespace": 0.0029413108812682298 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756462920.2700567, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6808559.778662505, + "end_time": 6809723.039604466, + "total_evaluation_time_seconds": "1163.2609419617802" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-51-42.351818.json b/results/llama-3.2-1b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-51-42.351818.json new file mode 100644 index 0000000000000000000000000000000000000000..6a599750711b3ac19387649a91d90ca5677cee61 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-51-42.351818.json @@ -0,0 +1,133 @@ +{ + "results": { + "gpqa_main_zeroshot": { + "alias": "gpqa_main_zeroshot", + "acc,none": 0.2767857142857143, + "acc_stderr,none": 0.021161749643954904, + "acc_norm,none": 0.2767857142857143, + "acc_norm_stderr,none": 0.021161749643954904 + } + }, + "group_subtasks": { + "gpqa_main_zeroshot": [] + }, + "configs": { + "gpqa_main_zeroshot": { + "task": "gpqa_main_zeroshot", + "tag": "gpqa", + "dataset_path": "Idavidrein/gpqa", + "dataset_name": "gpqa_main", + "training_split": "train", + "validation_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "(A)", + "(B)", + "(C)", + "(D)" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "gpqa_main_zeroshot": 1.0 + }, + "n-shot": { + "gpqa_main_zeroshot": 0 + }, + "higher_is_better": { + "gpqa_main_zeroshot": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "gpqa_main_zeroshot": { + "original": 448, + "effective": 448 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-Instruct-GGUF,gguf_file=llama-3.2-1b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b1d8fe7ec042f9b613dccaf07af2e0eacdcc65b5", + "batch_size": "auto:4", + "batch_sizes": [ + 13, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756417771.6519642, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6763435.05272513, + "end_time": 6763692.472672896, + "total_evaluation_time_seconds": "257.4199477666989" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T20-31-45.423595.json b/results/llama-3.2-1b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T20-31-45.423595.json new file mode 100644 index 0000000000000000000000000000000000000000..cc4e9a6dff658c2cc06f985c79ef04feb46d4690 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T20-31-45.423595.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4246166102370046, + "acc_stderr,none": 0.004932745013072505, + "acc_norm,none": 0.5249950209121689, + "acc_norm_stderr,none": 0.004983542768853415 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-Instruct-GGUF,gguf_file=llama-3.2-1b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b1d8fe7ec042f9b613dccaf07af2e0eacdcc65b5", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756412777.1090705, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6758499.739481736, + "end_time": 6758895.542821981, + "total_evaluation_time_seconds": "395.80334024503827" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-20-48.389784.json b/results/llama-3.2-1b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-20-48.389784.json new file mode 100644 index 0000000000000000000000000000000000000000..45933e367368f53573125a9b6b7bf2ef4366e322 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-20-48.389784.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.43796056562437763, + "acc_stderr,none": 0.004951222171763173, + "acc_norm,none": 0.578868751244772, + "acc_norm_stderr,none": 0.004927314729433585 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-Instruct-GGUF,gguf_file=llama-3.2-1b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b1d8fe7ec042f9b613dccaf07af2e0eacdcc65b5", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756413230.828876, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6758944.853718012, + "end_time": 6761838.510691592, + "total_evaluation_time_seconds": "2893.6569735798985" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m/ifeval/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-29T12-50-00.797800.json b/results/llama-3.2-1b-instruct-q3_k_m/ifeval/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-29T12-50-00.797800.json new file mode 100644 index 0000000000000000000000000000000000000000..37216f82ff394084c273a91a81bf11966938e774 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m/ifeval/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-29T12-50-00.797800.json @@ -0,0 +1,141 @@ +{ + "results": { + "ifeval": { + "alias": "ifeval", + "prompt_level_strict_acc,none": 0.4195933456561922, + "prompt_level_strict_acc_stderr,none": 0.021236532548855144, + "inst_level_strict_acc,none": 0.5599520383693045, + "inst_level_strict_acc_stderr,none": "N/A", + "prompt_level_loose_acc,none": 0.47504621072088726, + "prompt_level_loose_acc_stderr,none": 0.021489761058967286, + "inst_level_loose_acc,none": 0.6031175059952039, + "inst_level_loose_acc_stderr,none": "N/A" + } + }, + "group_subtasks": { + "ifeval": [] + }, + "configs": { + "ifeval": { + "task": "ifeval", + "dataset_path": "google/IFEval", + "test_split": "train", + "doc_to_text": "prompt", + "doc_to_target": 0, + "unsafe_code": false, + "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "prompt_level_strict_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_strict_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + }, + { + "metric": "prompt_level_loose_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_loose_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [], + "do_sample": false, + "temperature": 0.0, + "max_gen_toks": 1280 + }, + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 4.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "ifeval": 4.0 + }, + "n-shot": { + "ifeval": 0 + }, + "higher_is_better": { + "ifeval": { + "prompt_level_strict_acc": true, + "inst_level_strict_acc": true, + "prompt_level_loose_acc": true, + "inst_level_loose_acc": true + } + }, + "n-samples": { + "ifeval": { + "original": 541, + "effective": 541 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-Instruct-GGUF,gguf_file=llama-3.2-1b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b1d8fe7ec042f9b613dccaf07af2e0eacdcc65b5", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756470395.368383, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6816092.650853158, + "end_time": 6817590.91863361, + "total_evaluation_time_seconds": "1498.2677804520354" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T20-24-16.768708.json b/results/llama-3.2-1b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T20-24-16.768708.json new file mode 100644 index 0000000000000000000000000000000000000000..8589943b12bef5edd16b6bdc0aeefc905973d8f9 --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T20-24-16.768708.json @@ -0,0 +1,3516 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.4122632103688933, + "acc_stderr,none": 0.004062358370084786, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.38235919234856536, + "acc_stderr,none": 0.00694053336029471, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.2857142857142857, + "acc_stderr,none": 0.04040610178208843 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.5393939393939394, + "acc_stderr,none": 0.038922070165520156 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.5294117647058824, + "acc_stderr,none": 0.03503235296367989 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.48523206751054854, + "acc_stderr,none": 0.032533028078777435 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.5041322314049587, + "acc_stderr,none": 0.045641987674327526 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.5092592592592593, + "acc_stderr,none": 0.04832853553437052 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.3987730061349693, + "acc_stderr,none": 0.038470214204560226 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.3815028901734104, + "acc_stderr,none": 0.026152198619726758 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.293854748603352, + "acc_stderr,none": 0.015235075776719512 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.5048231511254019, + "acc_stderr,none": 0.028396770444111305 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.48148148148148145, + "acc_stderr,none": 0.02780165621232363 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.3011734028683181, + "acc_stderr,none": 0.011717148751648398 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.5847953216374269, + "acc_stderr,none": 0.03779275945503204 + }, + "mmlu_other": { + "acc,none": 0.4689411007402639, + "acc_stderr,none": 0.008816728178191898, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.39, + "acc_stderr,none": 0.04902071300001973 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.49056603773584906, + "acc_stderr,none": 0.03076739470780806 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.4161849710982659, + "acc_stderr,none": 0.037585177754049494 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.4484304932735426, + "acc_stderr,none": 0.033378837362551 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.5631067961165048, + "acc_stderr,none": 0.049111471073657785 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.5683760683760684, + "acc_stderr,none": 0.03244835535311493 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.5696040868454662, + "acc_stderr,none": 0.017705868776292367 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.5032679738562091, + "acc_stderr,none": 0.02862930519400347 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.31560283687943264, + "acc_stderr,none": 0.02772498944950923 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.40441176470588236, + "acc_stderr,none": 0.029812630701569736 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.3493975903614458, + "acc_stderr,none": 0.0371172519074075 + }, + "mmlu_social_sciences": { + "acc,none": 0.4614884627884303, + "acc_stderr,none": 0.008845951796652173, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.30701754385964913, + "acc_stderr,none": 0.04339138322579864 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.5404040404040404, + "acc_stderr,none": 0.03550702465131341 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.5440414507772021, + "acc_stderr,none": 0.03594413711272437 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.3974358974358974, + "acc_stderr,none": 0.024811920017903884 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.4369747899159664, + "acc_stderr,none": 0.032219436365662 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.5944954128440367, + "acc_stderr,none": 0.021050997991896817 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.48091603053435117, + "acc_stderr,none": 0.043820947055509894 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.3480392156862745, + "acc_stderr,none": 0.019270998708223946 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.4090909090909091, + "acc_stderr,none": 0.04709306978661896 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.44081632653061226, + "acc_stderr,none": 0.031784191141753605 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.5223880597014925, + "acc_stderr,none": 0.035319879302087256 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.56, + "acc_stderr,none": 0.049888765156985884 + }, + "mmlu_stem": { + "acc,none": 0.35299714557564227, + "acc_stderr,none": 0.008368981805254116, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.5185185185185185, + "acc_stderr,none": 0.043163785995113245 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.506578947368421, + "acc_stderr,none": 0.040685900502249725 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.4375, + "acc_stderr,none": 0.04148415739394154 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.32, + "acc_stderr,none": 0.04688261722621507 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.2549019607843137, + "acc_stderr,none": 0.0433643270799318 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.47, + "acc_stderr,none": 0.05016135580465919 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.4085106382978723, + "acc_stderr,none": 0.03213418026701576 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.42758620689655175, + "acc_stderr,none": 0.041227371113703344 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.26455026455026454, + "acc_stderr,none": 0.022717467897708593 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.47096774193548385, + "acc_stderr,none": 0.028396016402761053 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.39408866995073893, + "acc_stderr,none": 0.03438157967036543 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.24074074074074073, + "acc_stderr,none": 0.02606715922227584 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.24503311258278146, + "acc_stderr,none": 0.035118075718047294 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.30092592592592593, + "acc_stderr,none": 0.03128039084329886 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.2857142857142857, + "acc_stderr,none": 0.042878587513404585 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.4122632103688933, + "acc_stderr,none": 0.004062358370084786, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.38235919234856536, + "acc_stderr,none": 0.00694053336029471, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.4689411007402639, + "acc_stderr,none": 0.008816728178191898, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.4614884627884303, + "acc_stderr,none": 0.008845951796652173, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.35299714557564227, + "acc_stderr,none": 0.008368981805254116, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_moral_disputes", + "mmlu_formal_logic", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_high_school_european_history", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios" + ], + "mmlu_social_sciences": [ + "mmlu_econometrics", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology" + ], + "mmlu_other": [ + "mmlu_miscellaneous", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_global_facts", + "mmlu_management", + "mmlu_human_aging", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics", + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition" + ], + "mmlu_stem": [ + "mmlu_high_school_biology", + "mmlu_machine_learning", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_high_school_statistics", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_college_physics", + "mmlu_high_school_computer_science", + "mmlu_college_mathematics", + "mmlu_high_school_chemistry", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-Instruct-GGUF,gguf_file=llama-3.2-1b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b1d8fe7ec042f9b613dccaf07af2e0eacdcc65b5", + "batch_size": 1, + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756411500.0554235, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6757221.555934522, + "end_time": 6758446.889011447, + "total_evaluation_time_seconds": "1225.333076925017" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-24-51.469470.json b/results/llama-3.2-1b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-24-51.469470.json new file mode 100644 index 0000000000000000000000000000000000000000..254239b1634f1b6bcecbc6a1b9ff56c993be481d --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-24-51.469470.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7029379760609358, + "acc_stderr,none": 0.010661725404814675, + "acc_norm,none": 0.7002176278563657, + "acc_norm_stderr,none": 0.010689686967138 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-Instruct-GGUF,gguf_file=llama-3.2-1b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b1d8fe7ec042f9b613dccaf07af2e0eacdcc65b5", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756416224.5743294, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6761914.906331075, + "end_time": 6762081.590392968, + "total_evaluation_time_seconds": "166.68406189233065" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-45-46.626894.json b/results/llama-3.2-1b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-45-46.626894.json new file mode 100644 index 0000000000000000000000000000000000000000..65889637cf30ec2ca77738dc251a0530bd6f1c6c --- /dev/null +++ b/results/llama-3.2-1b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.2-1B-Instruct-GGUF/results_2025-08-28T21-45-46.626894.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.19003566651805617, + "exact_match_stderr,remove_whitespace": 0.0029288894405590585 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "gguf_file": "llama-3.2-1b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B-Instruct" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-Instruct-GGUF,gguf_file=llama-3.2-1b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B-Instruct", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b1d8fe7ec042f9b613dccaf07af2e0eacdcc65b5", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756416506.4349973, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6762170.177418074, + "end_time": 6763336.747687881, + "total_evaluation_time_seconds": "1166.5702698072419" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T08-36-32.696870.json b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T08-36-32.696870.json new file mode 100644 index 0000000000000000000000000000000000000000..59a2f10489f1e19a4c5a0e8218e597bbcbfb0f9d --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T08-36-32.696870.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.44761999601672975, + "acc_stderr,none": 0.004962325297840594, + "acc_norm,none": 0.5952001593308106, + "acc_norm_stderr,none": 0.004898501014226157 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756110754.0641475, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10242119.679062672, + "end_time": 10242416.135534978, + "total_evaluation_time_seconds": "296.456472305581" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T17-21-28.277871.json b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T17-21-28.277871.json new file mode 100644 index 0000000000000000000000000000000000000000..d6fb6d5ee2e2751c9f5c53422ff17de0c4b6b8c4 --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T17-21-28.277871.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.44761999601672975, + "acc_stderr,none": 0.004962325297840594, + "acc_norm,none": 0.5952001593308106, + "acc_norm_stderr,none": 0.004898501014226157 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756142249.188221, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10273613.8154548, + "end_time": 10273911.71645836, + "total_evaluation_time_seconds": "297.9010035600513" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T15-39-22.954616.json b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T15-39-22.954616.json new file mode 100644 index 0000000000000000000000000000000000000000..de781596c2e88be8c2f0897835b8a900e8f7ca16 --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T15-39-22.954616.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.44761999601672975, + "acc_stderr,none": 0.004962325297840594, + "acc_norm,none": 0.5952001593308106, + "acc_norm_stderr,none": 0.004898501014226157 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756222523.8884416, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10353882.643228102, + "end_time": 10354186.393249808, + "total_evaluation_time_seconds": "303.7500217054039" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T09-18-15.038165.json b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T09-18-15.038165.json new file mode 100644 index 0000000000000000000000000000000000000000..f73a1be3b2b88e1be9e9e4a5c0c85565a4e60d3c --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T09-18-15.038165.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.45200159330810596, + "acc_stderr,none": 0.004966736811010784, + "acc_norm,none": 0.6101374228241386, + "acc_norm_stderr,none": 0.0048672216344610745 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756111093.1084895, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10242458.876950616, + "end_time": 10244918.476882713, + "total_evaluation_time_seconds": "2459.5999320968986" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T18-03-25.628978.json b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T18-03-25.628978.json new file mode 100644 index 0000000000000000000000000000000000000000..435f018263eb843cee89c7658fb1396a310cb06d --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T18-03-25.628978.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.45200159330810596, + "acc_stderr,none": 0.004966736811010784, + "acc_norm,none": 0.6101374228241386, + "acc_norm_stderr,none": 0.0048672216344610745 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756142590.9302, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10273955.347320303, + "end_time": 10276429.067681711, + "total_evaluation_time_seconds": "2473.720361407846" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T16-21-01.953576.json b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T16-21-01.953576.json new file mode 100644 index 0000000000000000000000000000000000000000..e776be92873def2c9615b92b64481556d331fb6a --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T16-21-01.953576.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.45200159330810596, + "acc_stderr,none": 0.004966736811010784, + "acc_norm,none": 0.6101374228241386, + "acc_norm_stderr,none": 0.0048672216344610745 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756222866.3488133, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10354230.285668558, + "end_time": 10356685.392329829, + "total_evaluation_time_seconds": "2455.106661271304" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T08-30-53.451111.json b/results/llama-3.2-1b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T08-30-53.451111.json new file mode 100644 index 0000000000000000000000000000000000000000..5a8a61ffdfd67eba2663d7eaa5525db0e19e79c5 --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T08-30-53.451111.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.28599914542088023, + "acc_stderr,none": 0.003801407132730797, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2669500531349628, + "acc_stderr,none": 0.006446522380828226, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.1984126984126984, + "acc_stderr,none": 0.03567016675276862 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.28484848484848485, + "acc_stderr,none": 0.03524390844511785 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.2696078431372549, + "acc_stderr,none": 0.0311455706594868 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.270042194092827, + "acc_stderr,none": 0.02890072190629346 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.30578512396694213, + "acc_stderr,none": 0.04205953933884123 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.32407407407407407, + "acc_stderr,none": 0.04524596007030053 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.26993865030674846, + "acc_stderr,none": 0.034878251684978955 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.2543352601156069, + "acc_stderr,none": 0.02344582627654551 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.2424581005586592, + "acc_stderr,none": 0.01433352205921795 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.2540192926045016, + "acc_stderr,none": 0.024723861504771655 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.33024691358024694, + "acc_stderr,none": 0.026168298456732783 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.27183833116036504, + "acc_stderr,none": 0.011363135278651513 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.23976608187134502, + "acc_stderr,none": 0.03274485211946959 + }, + "mmlu_other": { + "acc,none": 0.3070485999356292, + "acc_stderr,none": 0.008248304637238035, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.2830188679245283, + "acc_stderr,none": 0.027724236492700883 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.27167630057803466, + "acc_stderr,none": 0.03391750322321658 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.2, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.32286995515695066, + "acc_stderr,none": 0.031381476375754995 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.22330097087378642, + "acc_stderr,none": 0.041235531898914324 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.31196581196581197, + "acc_stderr,none": 0.030351527323344996 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.3665389527458493, + "acc_stderr,none": 0.017231244626797086 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.2973856209150327, + "acc_stderr,none": 0.026173908506718593 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.25177304964539005, + "acc_stderr,none": 0.025892151156709366 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.26838235294117646, + "acc_stderr,none": 0.026917481224377256 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.3433734939759036, + "acc_stderr,none": 0.036965843170105976 + }, + "mmlu_social_sciences": { + "acc,none": 0.2908677283067923, + "acc_stderr,none": 0.008159371619662978, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.23684210526315788, + "acc_stderr,none": 0.03999423879281335 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.35353535353535354, + "acc_stderr,none": 0.03406086723547151 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.34196891191709844, + "acc_stderr,none": 0.034234651001042775 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.2846153846153846, + "acc_stderr,none": 0.022878322799706304 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.25630252100840334, + "acc_stderr,none": 0.028359620870533925 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.26972477064220185, + "acc_stderr,none": 0.019028486711115473 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.24427480916030533, + "acc_stderr,none": 0.037683359597287434 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.272875816993464, + "acc_stderr,none": 0.01802047414839352 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.2636363636363636, + "acc_stderr,none": 0.04220224692971989 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.3551020408163265, + "acc_stderr,none": 0.03063565515038758 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.25870646766169153, + "acc_stderr,none": 0.030965903123573005 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.46, + "acc_stderr,none": 0.05009082659620332 + }, + "mmlu_stem": { + "acc,none": 0.2889311766571519, + "acc_stderr,none": 0.008038240405836967, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.2814814814814815, + "acc_stderr,none": 0.03885004245800249 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.25, + "acc_stderr,none": 0.03523807393012047 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.2152777777777778, + "acc_stderr,none": 0.034370793441061386 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.24, + "acc_stderr,none": 0.04292346959909278 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.19607843137254902, + "acc_stderr,none": 0.03950581861179961 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.36, + "acc_stderr,none": 0.048241815132442176 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.33191489361702126, + "acc_stderr,none": 0.030783736757745678 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.2689655172413793, + "acc_stderr,none": 0.03695183311650232 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.25925925925925924, + "acc_stderr,none": 0.022569897074918445 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.33548387096774196, + "acc_stderr,none": 0.026860206444724297 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.2512315270935961, + "acc_stderr,none": 0.03051653073269444 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.27037037037037037, + "acc_stderr,none": 0.027080372815145717 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.24503311258278146, + "acc_stderr,none": 0.035118075718047294 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4305555555555556, + "acc_stderr,none": 0.03376922151252338 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.3125, + "acc_stderr,none": 0.043994650575715215 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.28599914542088023, + "acc_stderr,none": 0.003801407132730797, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2669500531349628, + "acc_stderr,none": 0.006446522380828226, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.3070485999356292, + "acc_stderr,none": 0.008248304637238035, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.2908677283067923, + "acc_stderr,none": 0.008159371619662978, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.2889311766571519, + "acc_stderr,none": 0.008038240405836967, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 57, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756109679.952225, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10241039.58338538, + "end_time": 10242076.889224041, + "total_evaluation_time_seconds": "1037.3058386612684" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T17-15-46.830511.json b/results/llama-3.2-1b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T17-15-46.830511.json new file mode 100644 index 0000000000000000000000000000000000000000..4848e575767f2c16b38f3a7545b9532280f5b517 --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T17-15-46.830511.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.28599914542088023, + "acc_stderr,none": 0.003801407132730797, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2669500531349628, + "acc_stderr,none": 0.006446522380828226, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.1984126984126984, + "acc_stderr,none": 0.03567016675276862 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.28484848484848485, + "acc_stderr,none": 0.03524390844511785 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.2696078431372549, + "acc_stderr,none": 0.0311455706594868 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.270042194092827, + "acc_stderr,none": 0.02890072190629346 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.30578512396694213, + "acc_stderr,none": 0.04205953933884123 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.32407407407407407, + "acc_stderr,none": 0.04524596007030053 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.26993865030674846, + "acc_stderr,none": 0.034878251684978955 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.2543352601156069, + "acc_stderr,none": 0.02344582627654551 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.2424581005586592, + "acc_stderr,none": 0.01433352205921795 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.2540192926045016, + "acc_stderr,none": 0.024723861504771655 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.33024691358024694, + "acc_stderr,none": 0.026168298456732783 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.27183833116036504, + "acc_stderr,none": 0.011363135278651513 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.23976608187134502, + "acc_stderr,none": 0.03274485211946959 + }, + "mmlu_other": { + "acc,none": 0.3070485999356292, + "acc_stderr,none": 0.008248304637238035, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.2830188679245283, + "acc_stderr,none": 0.027724236492700883 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.27167630057803466, + "acc_stderr,none": 0.03391750322321658 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.2, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.32286995515695066, + "acc_stderr,none": 0.031381476375754995 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.22330097087378642, + "acc_stderr,none": 0.041235531898914324 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.31196581196581197, + "acc_stderr,none": 0.030351527323344996 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.3665389527458493, + "acc_stderr,none": 0.017231244626797086 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.2973856209150327, + "acc_stderr,none": 0.026173908506718593 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.25177304964539005, + "acc_stderr,none": 0.025892151156709366 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.26838235294117646, + "acc_stderr,none": 0.026917481224377256 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.3433734939759036, + "acc_stderr,none": 0.036965843170105976 + }, + "mmlu_social_sciences": { + "acc,none": 0.2908677283067923, + "acc_stderr,none": 0.008159371619662978, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.23684210526315788, + "acc_stderr,none": 0.03999423879281335 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.35353535353535354, + "acc_stderr,none": 0.03406086723547151 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.34196891191709844, + "acc_stderr,none": 0.034234651001042775 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.2846153846153846, + "acc_stderr,none": 0.022878322799706304 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.25630252100840334, + "acc_stderr,none": 0.028359620870533925 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.26972477064220185, + "acc_stderr,none": 0.019028486711115473 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.24427480916030533, + "acc_stderr,none": 0.037683359597287434 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.272875816993464, + "acc_stderr,none": 0.01802047414839352 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.2636363636363636, + "acc_stderr,none": 0.04220224692971989 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.3551020408163265, + "acc_stderr,none": 0.03063565515038758 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.25870646766169153, + "acc_stderr,none": 0.030965903123573005 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.46, + "acc_stderr,none": 0.05009082659620332 + }, + "mmlu_stem": { + "acc,none": 0.2889311766571519, + "acc_stderr,none": 0.008038240405836967, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.2814814814814815, + "acc_stderr,none": 0.03885004245800249 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.25, + "acc_stderr,none": 0.03523807393012047 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.2152777777777778, + "acc_stderr,none": 0.034370793441061386 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.24, + "acc_stderr,none": 0.04292346959909278 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.19607843137254902, + "acc_stderr,none": 0.03950581861179961 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.36, + "acc_stderr,none": 0.048241815132442176 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.33191489361702126, + "acc_stderr,none": 0.030783736757745678 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.2689655172413793, + "acc_stderr,none": 0.03695183311650232 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.25925925925925924, + "acc_stderr,none": 0.022569897074918445 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.33548387096774196, + "acc_stderr,none": 0.026860206444724297 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.2512315270935961, + "acc_stderr,none": 0.03051653073269444 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.27037037037037037, + "acc_stderr,none": 0.027080372815145717 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.24503311258278146, + "acc_stderr,none": 0.035118075718047294 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4305555555555556, + "acc_stderr,none": 0.03376922151252338 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.3125, + "acc_stderr,none": 0.043994650575715215 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.28599914542088023, + "acc_stderr,none": 0.003801407132730797, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2669500531349628, + "acc_stderr,none": 0.006446522380828226, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.3070485999356292, + "acc_stderr,none": 0.008248304637238035, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.2908677283067923, + "acc_stderr,none": 0.008159371619662978, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.2889311766571519, + "acc_stderr,none": 0.008038240405836967, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 57, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756141173.9658904, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10272538.165338067, + "end_time": 10273570.268583192, + "total_evaluation_time_seconds": "1032.1032451242208" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T15-33-35.640341.json b/results/llama-3.2-1b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T15-33-35.640341.json new file mode 100644 index 0000000000000000000000000000000000000000..c26a72af2a7e7bab2bf2d65dc5162a1717ac37ee --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T15-33-35.640341.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.28599914542088023, + "acc_stderr,none": 0.003801407132730797, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2669500531349628, + "acc_stderr,none": 0.006446522380828226, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.1984126984126984, + "acc_stderr,none": 0.03567016675276862 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.28484848484848485, + "acc_stderr,none": 0.03524390844511785 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.2696078431372549, + "acc_stderr,none": 0.0311455706594868 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.270042194092827, + "acc_stderr,none": 0.02890072190629346 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.30578512396694213, + "acc_stderr,none": 0.04205953933884123 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.32407407407407407, + "acc_stderr,none": 0.04524596007030053 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.26993865030674846, + "acc_stderr,none": 0.034878251684978955 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.2543352601156069, + "acc_stderr,none": 0.02344582627654551 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.2424581005586592, + "acc_stderr,none": 0.01433352205921795 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.2540192926045016, + "acc_stderr,none": 0.024723861504771655 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.33024691358024694, + "acc_stderr,none": 0.026168298456732783 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.27183833116036504, + "acc_stderr,none": 0.011363135278651513 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.23976608187134502, + "acc_stderr,none": 0.03274485211946959 + }, + "mmlu_other": { + "acc,none": 0.3070485999356292, + "acc_stderr,none": 0.008248304637238035, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.2830188679245283, + "acc_stderr,none": 0.027724236492700883 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.27167630057803466, + "acc_stderr,none": 0.03391750322321658 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.2, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.32286995515695066, + "acc_stderr,none": 0.031381476375754995 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.22330097087378642, + "acc_stderr,none": 0.041235531898914324 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.31196581196581197, + "acc_stderr,none": 0.030351527323344996 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.3665389527458493, + "acc_stderr,none": 0.017231244626797086 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.2973856209150327, + "acc_stderr,none": 0.026173908506718593 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.25177304964539005, + "acc_stderr,none": 0.025892151156709366 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.26838235294117646, + "acc_stderr,none": 0.026917481224377256 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.3433734939759036, + "acc_stderr,none": 0.036965843170105976 + }, + "mmlu_social_sciences": { + "acc,none": 0.2908677283067923, + "acc_stderr,none": 0.008159371619662978, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.23684210526315788, + "acc_stderr,none": 0.03999423879281335 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.35353535353535354, + "acc_stderr,none": 0.03406086723547151 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.34196891191709844, + "acc_stderr,none": 0.034234651001042775 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.2846153846153846, + "acc_stderr,none": 0.022878322799706304 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.25630252100840334, + "acc_stderr,none": 0.028359620870533925 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.26972477064220185, + "acc_stderr,none": 0.019028486711115473 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.24427480916030533, + "acc_stderr,none": 0.037683359597287434 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.272875816993464, + "acc_stderr,none": 0.01802047414839352 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.2636363636363636, + "acc_stderr,none": 0.04220224692971989 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.3551020408163265, + "acc_stderr,none": 0.03063565515038758 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.25870646766169153, + "acc_stderr,none": 0.030965903123573005 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.46, + "acc_stderr,none": 0.05009082659620332 + }, + "mmlu_stem": { + "acc,none": 0.2889311766571519, + "acc_stderr,none": 0.008038240405836967, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.2814814814814815, + "acc_stderr,none": 0.03885004245800249 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.25, + "acc_stderr,none": 0.03523807393012047 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.2152777777777778, + "acc_stderr,none": 0.034370793441061386 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.24, + "acc_stderr,none": 0.04292346959909278 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.19607843137254902, + "acc_stderr,none": 0.03950581861179961 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.36, + "acc_stderr,none": 0.048241815132442176 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.33191489361702126, + "acc_stderr,none": 0.030783736757745678 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.2689655172413793, + "acc_stderr,none": 0.03695183311650232 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.25925925925925924, + "acc_stderr,none": 0.022569897074918445 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.33548387096774196, + "acc_stderr,none": 0.026860206444724297 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.2512315270935961, + "acc_stderr,none": 0.03051653073269444 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.27037037037037037, + "acc_stderr,none": 0.027080372815145717 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.24503311258278146, + "acc_stderr,none": 0.035118075718047294 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4305555555555556, + "acc_stderr,none": 0.03376922151252338 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.3125, + "acc_stderr,none": 0.043994650575715215 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.28599914542088023, + "acc_stderr,none": 0.003801407132730797, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2669500531349628, + "acc_stderr,none": 0.006446522380828226, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.3070485999356292, + "acc_stderr,none": 0.008248304637238035, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.2908677283067923, + "acc_stderr,none": 0.008159371619662978, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.2889311766571519, + "acc_stderr,none": 0.008038240405836967, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 57, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756221435.3619773, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10352800.47994225, + "end_time": 10353839.078370897, + "total_evaluation_time_seconds": "1038.5984286479652" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T09-20-51.344459.json b/results/llama-3.2-1b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T09-20-51.344459.json new file mode 100644 index 0000000000000000000000000000000000000000..1b7067210819f99b65d6c8792fc5fb90f93d02bf --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T09-20-51.344459.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7241566920565833, + "acc_stderr,none": 0.010427805502729186, + "acc_norm,none": 0.735582154515778, + "acc_norm_stderr,none": 0.010289787244767095 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756113597.7233505, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10244962.217351785, + "end_time": 10245074.783188416, + "total_evaluation_time_seconds": "112.56583663076162" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T18-06-17.038339.json b/results/llama-3.2-1b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T18-06-17.038339.json new file mode 100644 index 0000000000000000000000000000000000000000..af8ef637ba078d6f1f2624afc7b1671d5187c547 --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T18-06-17.038339.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7241566920565833, + "acc_stderr,none": 0.010427805502729186, + "acc_norm,none": 0.735582154515778, + "acc_norm_stderr,none": 0.010289787244767095 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756145108.136912, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10276472.69843241, + "end_time": 10276600.477051124, + "total_evaluation_time_seconds": "127.77861871384084" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T16-23-35.432328.json b/results/llama-3.2-1b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T16-23-35.432328.json new file mode 100644 index 0000000000000000000000000000000000000000..b334acd03939da107fb761952498dbc51a1268cb --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T16-23-35.432328.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7241566920565833, + "acc_stderr,none": 0.010427805502729186, + "acc_norm,none": 0.735582154515778, + "acc_norm_stderr,none": 0.010289787244767095 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756225362.6126778, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10356728.147221912, + "end_time": 10356838.871051783, + "total_evaluation_time_seconds": "110.72382987104356" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T09-37-30.974935.json b/results/llama-3.2-1b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T09-37-30.974935.json new file mode 100644 index 0000000000000000000000000000000000000000..a06df1934be3d744e1ed12cb59544603ddee917a --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T09-37-30.974935.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.25780205082478824, + "exact_match_stderr,remove_whitespace": 0.003265543928768584 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756113753.3069797, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10245117.947245015, + "end_time": 10246074.412016004, + "total_evaluation_time_seconds": "956.4647709894925" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T18-22-45.486705.json b/results/llama-3.2-1b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T18-22-45.486705.json new file mode 100644 index 0000000000000000000000000000000000000000..7f5feb89afebbd72ab3d9392b144dbd07ccd9064 --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T18-22-45.486705.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.25780205082478824, + "exact_match_stderr,remove_whitespace": 0.003265543928768584 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756145278.8220675, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10276643.694116846, + "end_time": 10277588.924686773, + "total_evaluation_time_seconds": "945.2305699270219" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T16-39-53.723111.json b/results/llama-3.2-1b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T16-39-53.723111.json new file mode 100644 index 0000000000000000000000000000000000000000..8080318bc9c4673d88f96178bfc614bff543f305 --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T16-39-53.723111.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.25780205082478824, + "exact_match_stderr,remove_whitespace": 0.003265543928768584 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-1b-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-1b-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756225516.532917, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10356881.65025422, + "end_time": 10357817.16179386, + "total_evaluation_time_seconds": "935.5115396399051" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m/hellaswag-0/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T06-52-39.302899.json b/results/llama-3.2-1b-q3_k_m/hellaswag-0/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T06-52-39.302899.json new file mode 100644 index 0000000000000000000000000000000000000000..75186894391e249b1f14fd4a4f4b7aefecdbf42a --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m/hellaswag-0/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T06-52-39.302899.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.44921330412268473, + "acc_stderr,none": 0.004963974504003149, + "acc_norm,none": 0.5973909579764987, + "acc_norm_stderr,none": 0.004894210011303106 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-GGUF,gguf_file=llama-3.2-1b-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "08a09fef6184c41b4f38aa088140d25d4de3272e", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756190916.6981637, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10322281.634510616, + "end_time": 10322582.741031472, + "total_evaluation_time_seconds": "301.1065208557993" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m/hellaswag-10/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T07-34-32.340944.json b/results/llama-3.2-1b-q3_k_m/hellaswag-10/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T07-34-32.340944.json new file mode 100644 index 0000000000000000000000000000000000000000..bbbe4dc514d870f7863c9244471625c182e09eff --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m/hellaswag-10/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T07-34-32.340944.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.45309699263095, + "acc_stderr,none": 0.004967778940012014, + "acc_norm,none": 0.611431985660227, + "acc_norm_stderr,none": 0.004864286176731834 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-GGUF,gguf_file=llama-3.2-1b-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "08a09fef6184c41b4f38aa088140d25d4de3272e", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756191260.928619, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10322625.986424657, + "end_time": 10325095.77965906, + "total_evaluation_time_seconds": "2469.7932344041765" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m/mmlu-5/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T06-46-54.724960.json b/results/llama-3.2-1b-q3_k_m/mmlu-5/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T06-46-54.724960.json new file mode 100644 index 0000000000000000000000000000000000000000..377d3d3f63d8e859bb346315b2b6097a42533845 --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m/mmlu-5/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T06-46-54.724960.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.28507335137444806, + "acc_stderr,none": 0.003796560942944839, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2637619553666312, + "acc_stderr,none": 0.006413563221982008, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.19047619047619047, + "acc_stderr,none": 0.03512207412302052 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.26666666666666666, + "acc_stderr,none": 0.03453131801885415 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.25980392156862747, + "acc_stderr,none": 0.030778554678693247 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.270042194092827, + "acc_stderr,none": 0.02890072190629346 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.3305785123966942, + "acc_stderr,none": 0.042943408452120926 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.3148148148148148, + "acc_stderr,none": 0.04489931073591309 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.2392638036809816, + "acc_stderr,none": 0.033519538795212675 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.24277456647398843, + "acc_stderr,none": 0.02308365858698422 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.2424581005586592, + "acc_stderr,none": 0.01433352205921795 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.2315112540192926, + "acc_stderr,none": 0.023956532766639154 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.3611111111111111, + "acc_stderr,none": 0.026725868809100835 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.26727509778357234, + "acc_stderr,none": 0.011302607515637565 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.25146198830409355, + "acc_stderr,none": 0.03327504423846844 + }, + "mmlu_other": { + "acc,none": 0.3105889925973608, + "acc_stderr,none": 0.008279672308568298, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.2981132075471698, + "acc_stderr,none": 0.028152837942493788 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.2658959537572254, + "acc_stderr,none": 0.033687629322594316 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.2, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.34080717488789236, + "acc_stderr,none": 0.03181149747055356 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.22330097087378642, + "acc_stderr,none": 0.041235531898914324 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.32905982905982906, + "acc_stderr,none": 0.030782321577688235 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.3563218390804598, + "acc_stderr,none": 0.017125853762755817 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.3006535947712418, + "acc_stderr,none": 0.026256053835718912 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.2624113475177305, + "acc_stderr,none": 0.02624492034984306 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.2757352941176471, + "acc_stderr,none": 0.02714627193662515 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.3614457831325301, + "acc_stderr,none": 0.03740059382029325 + }, + "mmlu_social_sciences": { + "acc,none": 0.29119272018199543, + "acc_stderr,none": 0.008162244622091067, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.2894736842105263, + "acc_stderr,none": 0.04266339443159392 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.37373737373737376, + "acc_stderr,none": 0.03446897738659336 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.32642487046632124, + "acc_stderr,none": 0.03384028621143298 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.2743589743589744, + "acc_stderr,none": 0.022622765767493235 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.25210084033613445, + "acc_stderr,none": 0.02820554503327774 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.25137614678899084, + "acc_stderr,none": 0.018599206360287425 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.24427480916030533, + "acc_stderr,none": 0.037683359597287434 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.2826797385620915, + "acc_stderr,none": 0.018217269552053494 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.3090909090909091, + "acc_stderr,none": 0.04426294648200096 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.363265306122449, + "acc_stderr,none": 0.030789051139030802 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.2537313432835821, + "acc_stderr,none": 0.030769444967295972 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.43, + "acc_stderr,none": 0.049756985195624305 + }, + "mmlu_stem": { + "acc,none": 0.2857595940374247, + "acc_stderr,none": 0.008017441081836094, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.25925925925925924, + "acc_stderr,none": 0.03785714465066651 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.2631578947368421, + "acc_stderr,none": 0.03583496176361067 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.20833333333333334, + "acc_stderr,none": 0.03396116205845331 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.32, + "acc_stderr,none": 0.04688261722621507 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.25, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.21568627450980393, + "acc_stderr,none": 0.04092563958237658 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.35, + "acc_stderr,none": 0.04793724854411023 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.33617021276595743, + "acc_stderr,none": 0.03088161852067694 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.2620689655172414, + "acc_stderr,none": 0.03664666337225255 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.25925925925925924, + "acc_stderr,none": 0.022569897074918445 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.3225806451612903, + "acc_stderr,none": 0.026593084516572225 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.2413793103448276, + "acc_stderr,none": 0.030108330718011642 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.26296296296296295, + "acc_stderr,none": 0.026842057873833727 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.25165562913907286, + "acc_stderr,none": 0.03543304234389988 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4212962962962963, + "acc_stderr,none": 0.03367462138896084 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.3125, + "acc_stderr,none": 0.043994650575715215 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.28507335137444806, + "acc_stderr,none": 0.003796560942944839, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.2637619553666312, + "acc_stderr,none": 0.006413563221982008, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.3105889925973608, + "acc_stderr,none": 0.008279672308568298, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.29119272018199543, + "acc_stderr,none": 0.008162244622091067, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.2857595940374247, + "acc_stderr,none": 0.008017441081836094, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-GGUF,gguf_file=llama-3.2-1b-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "08a09fef6184c41b4f38aa088140d25d4de3272e", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 57, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756189840.9523957, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10321205.43584358, + "end_time": 10322238.163029933, + "total_evaluation_time_seconds": "1032.7271863538772" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m/piqa-0/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T07-37-11.796226.json b/results/llama-3.2-1b-q3_k_m/piqa-0/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T07-37-11.796226.json new file mode 100644 index 0000000000000000000000000000000000000000..e6e2251ab7940e437772d4bac4ebef62dbda712b --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m/piqa-0/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T07-37-11.796226.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7236126224156693, + "acc_stderr,none": 0.010434162388275511, + "acc_norm,none": 0.73449401523395, + "acc_norm_stderr,none": 0.01030330865302444 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-GGUF,gguf_file=llama-3.2-1b-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "08a09fef6184c41b4f38aa088140d25d4de3272e", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756193775.4921603, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10325139.704248251, + "end_time": 10325255.23477333, + "total_evaluation_time_seconds": "115.53052507899702" +} \ No newline at end of file diff --git a/results/llama-3.2-1b-q3_k_m/triviaqa-5/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T07-53-43.574236.json b/results/llama-3.2-1b-q3_k_m/triviaqa-5/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T07-53-43.574236.json new file mode 100644 index 0000000000000000000000000000000000000000..a9f069c3eefc206721a7a5bef94ece54ee035041 --- /dev/null +++ b/results/llama-3.2-1b-q3_k_m/triviaqa-5/skymizer__Llama-3.2-1B-GGUF/results_2025-08-26T07-53-43.574236.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.2599754792688364, + "exact_match_stderr,remove_whitespace": 0.00327447531162596 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "skymizer/Llama-3.2-1B-GGUF", + "gguf_file": "llama-3.2-1b-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-1B" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-1B-GGUF,gguf_file=llama-3.2-1b-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-1B", + "model_num_parameters": 1235814400, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "08a09fef6184c41b4f38aa088140d25d4de3272e", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756193932.6231658, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_eos_token": [ + "<|end_of_text|>", + "128001" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128001, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-1B-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-1B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10325298.129203964, + "end_time": 10326247.01243518, + "total_evaluation_time_seconds": "948.883231215179" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T08-48-14.697874.json b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T08-48-14.697874.json new file mode 100644 index 0000000000000000000000000000000000000000..744b9291fc06e7c765067e9167b0ea52aebf40c1 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/gpqa_main_zeroshot/.__models__/results_2025-08-29T08-48-14.697874.json @@ -0,0 +1,133 @@ +{ + "results": { + "gpqa_main_zeroshot": { + "alias": "gpqa_main_zeroshot", + "acc,none": 0.26339285714285715, + "acc_stderr,none": 0.02083369001657866, + "acc_norm,none": 0.26339285714285715, + "acc_norm_stderr,none": 0.02083369001657866 + } + }, + "group_subtasks": { + "gpqa_main_zeroshot": [] + }, + "configs": { + "gpqa_main_zeroshot": { + "task": "gpqa_main_zeroshot", + "tag": "gpqa", + "dataset_path": "Idavidrein/gpqa", + "dataset_name": "gpqa_main", + "training_split": "train", + "validation_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "(A)", + "(B)", + "(C)", + "(D)" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "gpqa_main_zeroshot": 1.0 + }, + "n-shot": { + "gpqa_main_zeroshot": 0 + }, + "higher_is_better": { + "gpqa_main_zeroshot": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "gpqa_main_zeroshot": { + "original": 448, + "effective": 448 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756457041.0766351, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6802727.02747147, + "end_time": 6803084.818811165, + "total_evaluation_time_seconds": "357.79133969545364" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-29T05-52-58.798334.json b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-29T05-52-58.798334.json new file mode 100644 index 0000000000000000000000000000000000000000..2e3c927647d1373577fc961ac3e9367ae7d34b90 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-29T05-52-58.798334.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.49522007568213505, + "acc_stderr,none": 0.004989553396413285, + "acc_norm,none": 0.6460864369647481, + "acc_norm_stderr,none": 0.004772054904404475 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756446060.0651374, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6791734.457381064, + "end_time": 6792568.917448073, + "total_evaluation_time_seconds": "834.460067008622" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T07-41-45.130092.json b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T07-41-45.130092.json new file mode 100644 index 0000000000000000000000000000000000000000..b9f2a9407635647344be6289c79d6038b6f64cd0 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-29T07-41-45.130092.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.5183230432184823, + "acc_stderr,none": 0.004986429808146497, + "acc_norm,none": 0.6964748058155746, + "acc_norm_stderr,none": 0.004588403419450036 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 28, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756446971.0941148, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6792654.649654925, + "end_time": 6799095.245380173, + "total_evaluation_time_seconds": "6440.595725248568" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T14-29-16.809186.json b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T14-29-16.809186.json new file mode 100644 index 0000000000000000000000000000000000000000..f0f46e0b923b381123edc413978461ffbce79574 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/ifeval/.__models__/results_2025-08-29T14-29-16.809186.json @@ -0,0 +1,141 @@ +{ + "results": { + "ifeval": { + "alias": "ifeval", + "prompt_level_strict_acc,none": 0.6543438077634011, + "prompt_level_strict_acc_stderr,none": 0.02046577943268254, + "inst_level_strict_acc,none": 0.7482014388489209, + "inst_level_strict_acc_stderr,none": "N/A", + "prompt_level_loose_acc,none": 0.7097966728280961, + "prompt_level_loose_acc_stderr,none": 0.019530856691222623, + "inst_level_loose_acc,none": 0.7925659472422062, + "inst_level_loose_acc_stderr,none": "N/A" + } + }, + "group_subtasks": { + "ifeval": [] + }, + "configs": { + "ifeval": { + "task": "ifeval", + "dataset_path": "google/IFEval", + "test_split": "train", + "doc_to_text": "prompt", + "doc_to_target": 0, + "unsafe_code": false, + "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "prompt_level_strict_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_strict_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + }, + { + "metric": "prompt_level_loose_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_loose_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [], + "do_sample": false, + "temperature": 0.0, + "max_gen_toks": 1280 + }, + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 4.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "ifeval": 4.0 + }, + "n-shot": { + "ifeval": 0 + }, + "higher_is_better": { + "ifeval": { + "prompt_level_strict_acc": true, + "inst_level_strict_acc": true, + "prompt_level_loose_acc": true, + "inst_level_loose_acc": true + } + }, + "n-samples": { + "ifeval": { + "original": 541, + "effective": 541 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756475253.6274798, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6820977.177537752, + "end_time": 6823546.93018596, + "total_evaluation_time_seconds": "2569.752648207359" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-29T05-37-06.773457.json b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-29T05-37-06.773457.json new file mode 100644 index 0000000000000000000000000000000000000000..a1b793ee8e3844be78459cd5f7b95e8c7b29be65 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-29T05-37-06.773457.json @@ -0,0 +1,3516 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.5916536105967811, + "acc_stderr,none": 0.003986785253705166, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.5581296493092455, + "acc_stderr,none": 0.006992363609102954, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.3888888888888889, + "acc_stderr,none": 0.043603148600774626 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.7333333333333333, + "acc_stderr,none": 0.034531318018854146 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.7205882352941176, + "acc_stderr,none": 0.031493281045079584 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.7805907172995781, + "acc_stderr,none": 0.026939106581553945 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.6942148760330579, + "acc_stderr,none": 0.04205953933884123 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.6851851851851852, + "acc_stderr,none": 0.0448993107359131 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.6748466257668712, + "acc_stderr,none": 0.036803503712864595 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.6502890173410405, + "acc_stderr,none": 0.02567428145653101 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.47374301675977654, + "acc_stderr,none": 0.016699427672784813 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.6816720257234726, + "acc_stderr,none": 0.026457225067810987 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.654320987654321, + "acc_stderr,none": 0.02646248777700184 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.4276401564537158, + "acc_stderr,none": 0.012635799922765681 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.7426900584795322, + "acc_stderr,none": 0.03352799844161867 + }, + "mmlu_other": { + "acc,none": 0.6585130350820727, + "acc_stderr,none": 0.008216723876376786, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.6490566037735849, + "acc_stderr,none": 0.029373646253234693 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.5953757225433526, + "acc_stderr,none": 0.037424611938872455 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.29, + "acc_stderr,none": 0.045604802157206865 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.6143497757847534, + "acc_stderr,none": 0.03266842214289207 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.7766990291262136, + "acc_stderr,none": 0.041235531898914324 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.8504273504273504, + "acc_stderr,none": 0.023365051491753757 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.75, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.7586206896551724, + "acc_stderr,none": 0.015302380123542047 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.673202614379085, + "acc_stderr,none": 0.026857294663281482 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.46099290780141844, + "acc_stderr,none": 0.029736592526424455 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.6654411764705882, + "acc_stderr,none": 0.028661996202335335 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.4939759036144578, + "acc_stderr,none": 0.03892212195333041 + }, + "mmlu_social_sciences": { + "acc,none": 0.6642833929151771, + "acc_stderr,none": 0.008328651903138825, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.41228070175438597, + "acc_stderr,none": 0.046306532033665936 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.7828282828282829, + "acc_stderr,none": 0.02937661648494561 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.7979274611398963, + "acc_stderr,none": 0.028979089794296756 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.5794871794871795, + "acc_stderr,none": 0.02502861027671089 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.5882352941176471, + "acc_stderr,none": 0.031968769891957786 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.763302752293578, + "acc_stderr,none": 0.018224078117299015 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.6717557251908397, + "acc_stderr,none": 0.041184385658063025 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.576797385620915, + "acc_stderr,none": 0.01998780976948206 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.5818181818181818, + "acc_stderr,none": 0.04724577405731571 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.6938775510204082, + "acc_stderr,none": 0.029504896454595975 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.7512437810945274, + "acc_stderr,none": 0.030567675938916686 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.8, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_stem": { + "acc,none": 0.5049159530605772, + "acc_stderr,none": 0.008646360981692787, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.562962962962963, + "acc_stderr,none": 0.042849586397534056 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.5986842105263158, + "acc_stderr,none": 0.03988903703336284 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.6875, + "acc_stderr,none": 0.038760854559127644 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.46, + "acc_stderr,none": 0.05009082659620332 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.52, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.3431372549019608, + "acc_stderr,none": 0.04724007352383884 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.64, + "acc_stderr,none": 0.048241815132442176 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.5106382978723404, + "acc_stderr,none": 0.03267862331014062 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.5448275862068965, + "acc_stderr,none": 0.04149886942192114 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.41534391534391535, + "acc_stderr,none": 0.025379524910778408 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.7322580645161291, + "acc_stderr,none": 0.025189006660212412 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.5369458128078818, + "acc_stderr,none": 0.03508370520442665 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.61, + "acc_stderr,none": 0.04902071300001973 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.37777777777777777, + "acc_stderr,none": 0.029560707392465774 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.39072847682119205, + "acc_stderr,none": 0.039837983066598075 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.5324074074074074, + "acc_stderr,none": 0.03402801581358966 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.35714285714285715, + "acc_stderr,none": 0.045479609997643805 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.5916536105967811, + "acc_stderr,none": 0.003986785253705166, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.5581296493092455, + "acc_stderr,none": 0.006992363609102954, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.6585130350820727, + "acc_stderr,none": 0.008216723876376786, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.6642833929151771, + "acc_stderr,none": 0.008328651903138825, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.5049159530605772, + "acc_stderr,none": 0.008646360981692787, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_moral_disputes", + "mmlu_formal_logic", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_high_school_european_history", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios" + ], + "mmlu_social_sciences": [ + "mmlu_econometrics", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology" + ], + "mmlu_other": [ + "mmlu_miscellaneous", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_global_facts", + "mmlu_management", + "mmlu_human_aging", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics", + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition" + ], + "mmlu_stem": [ + "mmlu_high_school_biology", + "mmlu_machine_learning", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_high_school_statistics", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_college_physics", + "mmlu_high_school_computer_science", + "mmlu_college_mathematics", + "mmlu_high_school_chemistry", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": 1, + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756443566.1156235, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6789213.786148419, + "end_time": 6791616.893799104, + "total_evaluation_time_seconds": "2403.107650685124" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T08-05-15.014739.json b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T08-05-15.014739.json new file mode 100644 index 0000000000000000000000000000000000000000..0eb4e97d7fc422f10ac088f59f0e9d557b9a7cbf --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-29T08-05-15.014739.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.750272034820457, + "acc_stderr,none": 0.010099232969867361, + "acc_norm,none": 0.750272034820457, + "acc_norm_stderr,none": 0.010099232969867361 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756454557.8259547, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6800187.331599416, + "end_time": 6800505.135525793, + "total_evaluation_time_seconds": "317.8039263766259" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T08-40-55.918050.json b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T08-40-55.918050.json new file mode 100644 index 0000000000000000000000000000000000000000..61ba5d6a29c41ec81ebbca0e2212170deb5d6ea9 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-29T08-40-55.918050.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.4182456531431119, + "exact_match_stderr,remove_whitespace": 0.0036824597442680776 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=llama-3.2-3b-instruct-q3_k_m-dc-b10.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756454997.5057132, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6800624.858237982, + "end_time": 6802646.038971434, + "total_evaluation_time_seconds": "2021.180733452551" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-59-26.706349.json b/results/llama-3.2-3b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-59-26.706349.json new file mode 100644 index 0000000000000000000000000000000000000000..2abe7ead691ea41fca0d67bb734f7baf4e80b2f0 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m/gpqa_main_zeroshot/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-59-26.706349.json @@ -0,0 +1,133 @@ +{ + "results": { + "gpqa_main_zeroshot": { + "alias": "gpqa_main_zeroshot", + "acc,none": 0.25892857142857145, + "acc_stderr,none": 0.020718879324472094, + "acc_norm,none": 0.25892857142857145, + "acc_norm_stderr,none": 0.020718879324472094 + } + }, + "group_subtasks": { + "gpqa_main_zeroshot": [] + }, + "configs": { + "gpqa_main_zeroshot": { + "task": "gpqa_main_zeroshot", + "tag": "gpqa", + "dataset_path": "Idavidrein/gpqa", + "dataset_name": "gpqa_main", + "training_split": "train", + "validation_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "(A)", + "(B)", + "(C)", + "(D)" + ], + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "gpqa_main_zeroshot": 1.0 + }, + "n-shot": { + "gpqa_main_zeroshot": 0 + }, + "higher_is_better": { + "gpqa_main_zeroshot": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "gpqa_main_zeroshot": { + "original": 448, + "effective": 448 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-3B-Instruct-GGUF,gguf_file=llama-3.2-3b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "56afa4fd9a3d7846d5411f245aeeae5a01448751", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756410896.6452813, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-3B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6756616.943976382, + "end_time": 6756956.827091952, + "total_evaluation_time_seconds": "339.8831155700609" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T17-20-30.918649.json b/results/llama-3.2-3b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T17-20-30.918649.json new file mode 100644 index 0000000000000000000000000000000000000000..fc621e9b5f74812044453553cbb11dd3b9b64fa8 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m/hellaswag-0/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T17-20-30.918649.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4963154750049791, + "acc_stderr,none": 0.004989645929811662, + "acc_norm,none": 0.6443935471021709, + "acc_norm_stderr,none": 0.004777183508950096 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-3B-Instruct-GGUF,gguf_file=llama-3.2-3b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "56afa4fd9a3d7846d5411f245aeeae5a01448751", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756400847.578443, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-3B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6746557.510660827, + "end_time": 6747421.039241918, + "total_evaluation_time_seconds": "863.5285810912028" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-09-44.127975.json b/results/llama-3.2-3b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-09-44.127975.json new file mode 100644 index 0000000000000000000000000000000000000000..bf022a842738682b3f7ed49384da839e2877d243 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m/hellaswag-10/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-09-44.127975.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.5203146783509262, + "acc_stderr,none": 0.0049856612829986555, + "acc_norm,none": 0.6973710416251743, + "acc_norm_stderr,none": 0.0045845711025984505 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-3B-Instruct-GGUF,gguf_file=llama-3.2-3b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "56afa4fd9a3d7846d5411f245aeeae5a01448751", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 28, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756401745.5516791, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-3B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6747469.343793878, + "end_time": 6753974.248757057, + "total_evaluation_time_seconds": "6504.90496317856" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m/ifeval/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-29T12-24-07.631836.json b/results/llama-3.2-3b-instruct-q3_k_m/ifeval/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-29T12-24-07.631836.json new file mode 100644 index 0000000000000000000000000000000000000000..246ae5f233490f7ff2ee07e6928817076f13f724 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m/ifeval/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-29T12-24-07.631836.json @@ -0,0 +1,141 @@ +{ + "results": { + "ifeval": { + "alias": "ifeval", + "prompt_level_strict_acc,none": 0.6469500924214417, + "prompt_level_strict_acc_stderr,none": 0.020566318668824704, + "inst_level_strict_acc,none": 0.7434052757793765, + "inst_level_strict_acc_stderr,none": "N/A", + "prompt_level_loose_acc,none": 0.6913123844731978, + "prompt_level_loose_acc_stderr,none": 0.019879245251116444, + "inst_level_loose_acc,none": 0.7817745803357314, + "inst_level_loose_acc_stderr,none": "N/A" + } + }, + "group_subtasks": { + "ifeval": [] + }, + "configs": { + "ifeval": { + "task": "ifeval", + "dataset_path": "google/IFEval", + "test_split": "train", + "doc_to_text": "prompt", + "doc_to_target": 0, + "unsafe_code": false, + "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "prompt_level_strict_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_strict_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + }, + { + "metric": "prompt_level_loose_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "inst_level_loose_acc", + "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [], + "do_sample": false, + "temperature": 0.0, + "max_gen_toks": 1280 + }, + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 4.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "ifeval": 4.0 + }, + "n-shot": { + "ifeval": 0 + }, + "higher_is_better": { + "ifeval": { + "prompt_level_strict_acc": true, + "inst_level_strict_acc": true, + "prompt_level_loose_acc": true, + "inst_level_loose_acc": true + } + }, + "n-samples": { + "ifeval": { + "original": 541, + "effective": 541 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-3B-Instruct-GGUF,gguf_file=llama-3.2-3b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "56afa4fd9a3d7846d5411f245aeeae5a01448751", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756467999.8379674, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-3B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6813670.121477705, + "end_time": 6816037.752682229, + "total_evaluation_time_seconds": "2367.631204523146" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T17-05-01.612721.json b/results/llama-3.2-3b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T17-05-01.612721.json new file mode 100644 index 0000000000000000000000000000000000000000..762f6c843cb7cb8c89fdd6b0568c0a6db6964d05 --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m/mmlu-5/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T17-05-01.612721.json @@ -0,0 +1,3516 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.592436974789916, + "acc_stderr,none": 0.003983667966646353, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.5589798087141339, + "acc_stderr,none": 0.006988471944056959, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.42063492063492064, + "acc_stderr,none": 0.044154382267437474 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.7454545454545455, + "acc_stderr,none": 0.03401506715249038 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.7205882352941176, + "acc_stderr,none": 0.031493281045079584 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.7805907172995781, + "acc_stderr,none": 0.026939106581553945 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.6859504132231405, + "acc_stderr,none": 0.04236964753041019 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.6666666666666666, + "acc_stderr,none": 0.04557239513497755 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.6687116564417178, + "acc_stderr,none": 0.03697983910025588 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.6705202312138728, + "acc_stderr,none": 0.02530525813187972 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.47374301675977654, + "acc_stderr,none": 0.016699427672784813 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.6752411575562701, + "acc_stderr,none": 0.02659678228769707 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.6574074074074074, + "acc_stderr,none": 0.0264061459736257 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.42503259452411996, + "acc_stderr,none": 0.012625879884891987 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.7426900584795322, + "acc_stderr,none": 0.03352799844161867 + }, + "mmlu_other": { + "acc,none": 0.660122304473769, + "acc_stderr,none": 0.00820535627597423, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.59, + "acc_stderr,none": 0.04943110704237104 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.6452830188679245, + "acc_stderr,none": 0.029445175328199655 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.6011560693641619, + "acc_stderr,none": 0.03733626655383514 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.28, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.6188340807174888, + "acc_stderr,none": 0.0325962511841683 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.7669902912621359, + "acc_stderr,none": 0.041858325989283136 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.8547008547008547, + "acc_stderr,none": 0.02308663508684137 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.75, + "acc_stderr,none": 0.04351941398892446 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.7611749680715197, + "acc_stderr,none": 0.015246803197398642 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.673202614379085, + "acc_stderr,none": 0.026857294663281482 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.46808510638297873, + "acc_stderr,none": 0.02976667507587383 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.6654411764705882, + "acc_stderr,none": 0.028661996202335335 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.4939759036144578, + "acc_stderr,none": 0.03892212195333041 + }, + "mmlu_social_sciences": { + "acc,none": 0.663958401039974, + "acc_stderr,none": 0.008311992163009008, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.39473684210526316, + "acc_stderr,none": 0.04598188057816546 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.7878787878787878, + "acc_stderr,none": 0.029126522834586777 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.8238341968911918, + "acc_stderr,none": 0.02749350424454809 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.5769230769230769, + "acc_stderr,none": 0.025049197876042397 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.6008403361344538, + "acc_stderr,none": 0.031811100324139197 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.763302752293578, + "acc_stderr,none": 0.018224078117299015 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.6564885496183206, + "acc_stderr,none": 0.041649760719448814 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.5718954248366013, + "acc_stderr,none": 0.020017629214213063 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.5818181818181818, + "acc_stderr,none": 0.04724577405731571 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.6857142857142857, + "acc_stderr,none": 0.029719329422417437 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.7512437810945274, + "acc_stderr,none": 0.030567675938916686 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.8, + "acc_stderr,none": 0.04020151261036849 + }, + "mmlu_stem": { + "acc,none": 0.5058674278464954, + "acc_stderr,none": 0.00865063781886162, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.562962962962963, + "acc_stderr,none": 0.042849586397534056 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.6118421052631579, + "acc_stderr,none": 0.03965842097512745 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.7083333333333334, + "acc_stderr,none": 0.03800968060554863 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.45, + "acc_stderr,none": 0.05 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.49, + "acc_stderr,none": 0.05024183937956913 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.32, + "acc_stderr,none": 0.04688261722621507 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.3431372549019608, + "acc_stderr,none": 0.04724007352383884 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.63, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.5276595744680851, + "acc_stderr,none": 0.03263597118409762 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.5310344827586206, + "acc_stderr,none": 0.041586327620978254 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.4126984126984127, + "acc_stderr,none": 0.025355741263055214 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.7225806451612903, + "acc_stderr,none": 0.02547019683590008 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.541871921182266, + "acc_stderr,none": 0.035056301407857406 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.62, + "acc_stderr,none": 0.04878317312145634 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.37777777777777777, + "acc_stderr,none": 0.029560707392465774 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.3973509933774834, + "acc_stderr,none": 0.039955240076816834 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.5324074074074074, + "acc_stderr,none": 0.03402801581358966 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.3482142857142857, + "acc_stderr,none": 0.04521829902833581 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.592436974789916, + "acc_stderr,none": 0.003983667966646353, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.5589798087141339, + "acc_stderr,none": 0.006988471944056959, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.660122304473769, + "acc_stderr,none": 0.00820535627597423, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.663958401039974, + "acc_stderr,none": 0.008311992163009008, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.5058674278464954, + "acc_stderr,none": 0.00865063781886162, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_moral_disputes", + "mmlu_formal_logic", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_high_school_european_history", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios" + ], + "mmlu_social_sciences": [ + "mmlu_econometrics", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology" + ], + "mmlu_other": [ + "mmlu_miscellaneous", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_global_facts", + "mmlu_management", + "mmlu_human_aging", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics", + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition" + ], + "mmlu_stem": [ + "mmlu_high_school_biology", + "mmlu_machine_learning", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_high_school_statistics", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_college_physics", + "mmlu_high_school_computer_science", + "mmlu_college_mathematics", + "mmlu_high_school_chemistry", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-3B-Instruct-GGUF,gguf_file=llama-3.2-3b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "56afa4fd9a3d7846d5411f245aeeae5a01448751", + "batch_size": 1, + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756398431.3320112, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-3B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6744139.604738178, + "end_time": 6746491.732498547, + "total_evaluation_time_seconds": "2352.1277603693306" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-16-48.102547.json b/results/llama-3.2-3b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-16-48.102547.json new file mode 100644 index 0000000000000000000000000000000000000000..53d3198d271e9f24aaceb488ad35a596f8d5bbbe --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m/piqa-0/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-16-48.102547.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.750272034820457, + "acc_stderr,none": 0.010099232969867361, + "acc_norm,none": 0.7535364526659413, + "acc_norm_stderr,none": 0.01005481078967165 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-3B-Instruct-GGUF,gguf_file=llama-3.2-3b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "56afa4fd9a3d7846d5411f245aeeae5a01448751", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756408434.2433283, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-3B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6754086.151275051, + "end_time": 6754398.223271606, + "total_evaluation_time_seconds": "312.0719965547323" +} \ No newline at end of file diff --git a/results/llama-3.2-3b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-52-55.020937.json b/results/llama-3.2-3b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-52-55.020937.json new file mode 100644 index 0000000000000000000000000000000000000000..7d6dad427386c92991759f31098c89134e7dfcbf --- /dev/null +++ b/results/llama-3.2-3b-instruct-q3_k_m/triviaqa-5/skymizer__Llama-3.2-3B-Instruct-GGUF/results_2025-08-28T19-52-55.020937.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.4227596968345965, + "exact_match_stderr,remove_whitespace": 0.003687886812767444 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "gguf_file": "llama-3.2-3b-instruct-q3_k_m.gguf", + "tokenizer": "meta-llama/Llama-3.2-3B-Instruct" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Llama-3.2-3B-Instruct-GGUF,gguf_file=llama-3.2-3b-instruct-q3_k_m.gguf,tokenizer=meta-llama/Llama-3.2-3B-Instruct", + "model_num_parameters": 3212749824, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "56afa4fd9a3d7846d5411f245aeeae5a01448751", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.1", + "date": 1756408843.4706702, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_eos_token": [ + "<|eot_id|>", + "128009" + ], + "tokenizer_bos_token": [ + "<|begin_of_text|>", + "128000" + ], + "eot_token_id": 128009, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Llama-3.2-3B-Instruct-GGUF", + "model_name_sanitized": "skymizer__Llama-3.2-3B-Instruct-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "chat_template_sha": "5816fce10444e03c2e9ee1ef8a4a1ea61ae7e69e438613f3b17b69d0426223a4", + "start_time": 6754495.65559894, + "end_time": 6756565.141713668, + "total_evaluation_time_seconds": "2069.486114727333" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T10-06-30.750516.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T10-06-30.750516.json new file mode 100644 index 0000000000000000000000000000000000000000..eacf53458a4164a1693f254a50ed6c2bd1da1e47 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T10-06-30.750516.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4794861581358295, + "acc_stderr,none": 0.004985580065946382, + "acc_norm,none": 0.6485759808803028, + "acc_norm_stderr,none": 0.004764393985110763 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756116103.8522475, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10247468.572308635, + "end_time": 10247814.188507237, + "total_evaluation_time_seconds": "345.6161986012012" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T18-52-01.785618.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T18-52-01.785618.json new file mode 100644 index 0000000000000000000000000000000000000000..6fde2c182a9b3297161169240c77659619d8a569 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T18-52-01.785618.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4794861581358295, + "acc_stderr,none": 0.004985580065946382, + "acc_norm,none": 0.6485759808803028, + "acc_norm_stderr,none": 0.004764393985110763 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756147638.463722, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10279003.58916339, + "end_time": 10279345.224245518, + "total_evaluation_time_seconds": "341.6350821275264" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T17-08-52.296422.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T17-08-52.296422.json new file mode 100644 index 0000000000000000000000000000000000000000..d40d650c8edcc8bad432268812e055b343e969a2 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T17-08-52.296422.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4794861581358295, + "acc_stderr,none": 0.004985580065946382, + "acc_norm,none": 0.6485759808803028, + "acc_norm_stderr,none": 0.004764393985110763 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756227834.0062761, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10359199.519020038, + "end_time": 10359555.73482078, + "total_evaluation_time_seconds": "356.2158007416874" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T10-55-04.441756.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T10-55-04.441756.json new file mode 100644 index 0000000000000000000000000000000000000000..3d8ec1e6cf993e2dc759dc9fb4b54db066546369 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T10-55-04.441756.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.47251543517227645, + "acc_stderr,none": 0.00498223713340941, + "acc_norm,none": 0.6488747261501693, + "acc_norm_stderr,none": 0.00476346513903847 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756116495.5687602, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10247861.060375849, + "end_time": 10250727.878715975, + "total_evaluation_time_seconds": "2866.818340126425" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T19-40-45.095333.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T19-40-45.095333.json new file mode 100644 index 0000000000000000000000000000000000000000..678a753e39ab56c16b4f5cc7a6238ae01b0b1229 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T19-40-45.095333.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.47251543517227645, + "acc_stderr,none": 0.00498223713340941, + "acc_norm,none": 0.6488747261501693, + "acc_norm_stderr,none": 0.00476346513903847 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756148024.0348716, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10279388.826472867, + "end_time": 10282268.534040028, + "total_evaluation_time_seconds": "2879.707567160949" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T17-57-48.202942.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T17-57-48.202942.json new file mode 100644 index 0000000000000000000000000000000000000000..0606c446eb5f1ed23674d6ae1297bc1e3be58937 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T17-57-48.202942.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.47251543517227645, + "acc_stderr,none": 0.00498223713340941, + "acc_norm,none": 0.6488747261501693, + "acc_norm_stderr,none": 0.00476346513903847 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756228234.1213133, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10359599.284620412, + "end_time": 10362491.641646843, + "total_evaluation_time_seconds": "2892.3570264317095" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T10-00-00.909224.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T10-00-00.909224.json new file mode 100644 index 0000000000000000000000000000000000000000..c13d0a246d8ec9e2390507a8e54d637b90c9acaa --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T10-00-00.909224.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.5547642785927931, + "acc_stderr,none": 0.004012513195810285, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.4913921360255048, + "acc_stderr,none": 0.006879051905115341, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.42857142857142855, + "acc_stderr,none": 0.04426266681379905 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.703030303030303, + "acc_stderr,none": 0.03567969772268046 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.6813725490196079, + "acc_stderr,none": 0.032702871814820796 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.7046413502109705, + "acc_stderr,none": 0.029696338713422813 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.71900826446281, + "acc_stderr,none": 0.04103203830514515 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.7407407407407407, + "acc_stderr,none": 0.04236511258094632 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.6932515337423313, + "acc_stderr,none": 0.036230899157241474 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.5664739884393064, + "acc_stderr,none": 0.02668013476167924 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.24692737430167597, + "acc_stderr,none": 0.014422292204808857 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.6334405144694534, + "acc_stderr,none": 0.027368078243971635 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.5864197530864198, + "acc_stderr,none": 0.027402042040270014 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.40808344198174706, + "acc_stderr,none": 0.01255259895856359 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.7368421052631579, + "acc_stderr,none": 0.033773102522091994 + }, + "mmlu_other": { + "acc,none": 0.6095912455745092, + "acc_stderr,none": 0.008544041576531047, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.66, + "acc_stderr,none": 0.04760952285695234 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.6377358490566037, + "acc_stderr,none": 0.029582245128384355 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.6242774566473989, + "acc_stderr,none": 0.03692820767264865 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.36, + "acc_stderr,none": 0.048241815132442176 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.6322869955156951, + "acc_stderr,none": 0.03236198350928281 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.7475728155339806, + "acc_stderr,none": 0.04301250399690879 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.7905982905982906, + "acc_stderr,none": 0.026655699653922706 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.6, + "acc_stderr,none": 0.0492365963917331 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.6756066411238825, + "acc_stderr,none": 0.016740929047162616 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.6339869281045751, + "acc_stderr,none": 0.027582811415159628 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.4148936170212766, + "acc_stderr,none": 0.02939223658461248 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4852941176470588, + "acc_stderr,none": 0.030359697079046163 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.4819277108433735, + "acc_stderr,none": 0.03889951252827222 + }, + "mmlu_social_sciences": { + "acc,none": 0.6548586285342867, + "acc_stderr,none": 0.008439086883322488, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.41228070175438597, + "acc_stderr,none": 0.046306532033665936 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.7222222222222222, + "acc_stderr,none": 0.03191178226713548 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.7150259067357513, + "acc_stderr,none": 0.032577140777096614 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.6153846153846154, + "acc_stderr,none": 0.02466674491518715 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.6512605042016807, + "acc_stderr,none": 0.03095663632856658 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.7669724770642202, + "acc_stderr,none": 0.018125669180861538 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.6641221374045801, + "acc_stderr,none": 0.04142313771996665 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.5555555555555556, + "acc_stderr,none": 0.020102583895887222 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6545454545454545, + "acc_stderr,none": 0.045546196175410524 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.6326530612244898, + "acc_stderr,none": 0.030862144921087586 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.7164179104477612, + "acc_stderr,none": 0.031871875379197945 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.76, + "acc_stderr,none": 0.04292346959909278 + }, + "mmlu_stem": { + "acc,none": 0.49762131303520457, + "acc_stderr,none": 0.008675060585438295, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.45925925925925926, + "acc_stderr,none": 0.04304979692464244 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.6052631578947368, + "acc_stderr,none": 0.039777499346220775 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.6458333333333334, + "acc_stderr,none": 0.03999411135753542 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.43, + "acc_stderr,none": 0.049756985195624305 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.48, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.43137254901960786, + "acc_stderr,none": 0.049280995972875316 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.73, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.548936170212766, + "acc_stderr,none": 0.032529096196132014 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6344827586206897, + "acc_stderr,none": 0.04013124195424389 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.46825396825396826, + "acc_stderr,none": 0.02569935283213174 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.6741935483870968, + "acc_stderr,none": 0.026662010578567125 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.46798029556650245, + "acc_stderr,none": 0.03510766597959214 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.34444444444444444, + "acc_stderr,none": 0.02897264888484431 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.3576158940397351, + "acc_stderr,none": 0.0391345343117726 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4861111111111111, + "acc_stderr,none": 0.03408655867977753 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.3392857142857143, + "acc_stderr,none": 0.04493949068613541 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.5547642785927931, + "acc_stderr,none": 0.004012513195810285, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.4913921360255048, + "acc_stderr,none": 0.006879051905115341, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.6095912455745092, + "acc_stderr,none": 0.008544041576531047, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.6548586285342867, + "acc_stderr,none": 0.008439086883322488, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.49762131303520457, + "acc_stderr,none": 0.008675060585438295, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 36, + 51, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756114758.8735406, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10246123.118201349, + "end_time": 10247424.347304564, + "total_evaluation_time_seconds": "1301.2291032150388" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T18-45-36.117711.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T18-45-36.117711.json new file mode 100644 index 0000000000000000000000000000000000000000..b2d02f2f8faf8cd1ab693bd17eff9bf3555782a6 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T18-45-36.117711.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.5547642785927931, + "acc_stderr,none": 0.004012513195810285, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.4913921360255048, + "acc_stderr,none": 0.006879051905115341, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.42857142857142855, + "acc_stderr,none": 0.04426266681379905 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.703030303030303, + "acc_stderr,none": 0.03567969772268046 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.6813725490196079, + "acc_stderr,none": 0.032702871814820796 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.7046413502109705, + "acc_stderr,none": 0.029696338713422813 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.71900826446281, + "acc_stderr,none": 0.04103203830514515 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.7407407407407407, + "acc_stderr,none": 0.04236511258094632 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.6932515337423313, + "acc_stderr,none": 0.036230899157241474 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.5664739884393064, + "acc_stderr,none": 0.02668013476167924 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.24692737430167597, + "acc_stderr,none": 0.014422292204808857 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.6334405144694534, + "acc_stderr,none": 0.027368078243971635 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.5864197530864198, + "acc_stderr,none": 0.027402042040270014 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.40808344198174706, + "acc_stderr,none": 0.01255259895856359 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.7368421052631579, + "acc_stderr,none": 0.033773102522091994 + }, + "mmlu_other": { + "acc,none": 0.6095912455745092, + "acc_stderr,none": 0.008544041576531047, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.66, + "acc_stderr,none": 0.04760952285695234 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.6377358490566037, + "acc_stderr,none": 0.029582245128384355 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.6242774566473989, + "acc_stderr,none": 0.03692820767264865 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.36, + "acc_stderr,none": 0.048241815132442176 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.6322869955156951, + "acc_stderr,none": 0.03236198350928281 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.7475728155339806, + "acc_stderr,none": 0.04301250399690879 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.7905982905982906, + "acc_stderr,none": 0.026655699653922706 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.6, + "acc_stderr,none": 0.0492365963917331 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.6756066411238825, + "acc_stderr,none": 0.016740929047162616 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.6339869281045751, + "acc_stderr,none": 0.027582811415159628 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.4148936170212766, + "acc_stderr,none": 0.02939223658461248 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4852941176470588, + "acc_stderr,none": 0.030359697079046163 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.4819277108433735, + "acc_stderr,none": 0.03889951252827222 + }, + "mmlu_social_sciences": { + "acc,none": 0.6548586285342867, + "acc_stderr,none": 0.008439086883322488, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.41228070175438597, + "acc_stderr,none": 0.046306532033665936 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.7222222222222222, + "acc_stderr,none": 0.03191178226713548 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.7150259067357513, + "acc_stderr,none": 0.032577140777096614 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.6153846153846154, + "acc_stderr,none": 0.02466674491518715 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.6512605042016807, + "acc_stderr,none": 0.03095663632856658 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.7669724770642202, + "acc_stderr,none": 0.018125669180861538 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.6641221374045801, + "acc_stderr,none": 0.04142313771996665 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.5555555555555556, + "acc_stderr,none": 0.020102583895887222 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6545454545454545, + "acc_stderr,none": 0.045546196175410524 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.6326530612244898, + "acc_stderr,none": 0.030862144921087586 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.7164179104477612, + "acc_stderr,none": 0.031871875379197945 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.76, + "acc_stderr,none": 0.04292346959909278 + }, + "mmlu_stem": { + "acc,none": 0.49762131303520457, + "acc_stderr,none": 0.008675060585438295, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.45925925925925926, + "acc_stderr,none": 0.04304979692464244 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.6052631578947368, + "acc_stderr,none": 0.039777499346220775 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.6458333333333334, + "acc_stderr,none": 0.03999411135753542 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.43, + "acc_stderr,none": 0.049756985195624305 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.48, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.43137254901960786, + "acc_stderr,none": 0.049280995972875316 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.73, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.548936170212766, + "acc_stderr,none": 0.032529096196132014 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6344827586206897, + "acc_stderr,none": 0.04013124195424389 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.46825396825396826, + "acc_stderr,none": 0.02569935283213174 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.6741935483870968, + "acc_stderr,none": 0.026662010578567125 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.46798029556650245, + "acc_stderr,none": 0.03510766597959214 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.34444444444444444, + "acc_stderr,none": 0.02897264888484431 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.3576158940397351, + "acc_stderr,none": 0.0391345343117726 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4861111111111111, + "acc_stderr,none": 0.03408655867977753 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.3392857142857143, + "acc_stderr,none": 0.04493949068613541 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.5547642785927931, + "acc_stderr,none": 0.004012513195810285, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.4913921360255048, + "acc_stderr,none": 0.006879051905115341, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.6095912455745092, + "acc_stderr,none": 0.008544041576531047, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.6548586285342867, + "acc_stderr,none": 0.008439086883322488, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.49762131303520457, + "acc_stderr,none": 0.008675060585438295, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 36, + 51, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756146276.2177389, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10277640.3910733, + "end_time": 10278959.555775277, + "total_evaluation_time_seconds": "1319.1647019777447" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T17-02-12.668619.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T17-02-12.668619.json new file mode 100644 index 0000000000000000000000000000000000000000..d5909544387a8032dc64641d8d812957eedab9df --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T17-02-12.668619.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.5547642785927931, + "acc_stderr,none": 0.004012513195810285, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.4913921360255048, + "acc_stderr,none": 0.006879051905115341, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.42857142857142855, + "acc_stderr,none": 0.04426266681379905 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.703030303030303, + "acc_stderr,none": 0.03567969772268046 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.6813725490196079, + "acc_stderr,none": 0.032702871814820796 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.7046413502109705, + "acc_stderr,none": 0.029696338713422813 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.71900826446281, + "acc_stderr,none": 0.04103203830514515 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.7407407407407407, + "acc_stderr,none": 0.04236511258094632 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.6932515337423313, + "acc_stderr,none": 0.036230899157241474 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.5664739884393064, + "acc_stderr,none": 0.02668013476167924 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.24692737430167597, + "acc_stderr,none": 0.014422292204808857 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.6334405144694534, + "acc_stderr,none": 0.027368078243971635 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.5864197530864198, + "acc_stderr,none": 0.027402042040270014 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.40808344198174706, + "acc_stderr,none": 0.01255259895856359 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.7368421052631579, + "acc_stderr,none": 0.033773102522091994 + }, + "mmlu_other": { + "acc,none": 0.6095912455745092, + "acc_stderr,none": 0.008544041576531047, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.66, + "acc_stderr,none": 0.04760952285695234 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.6377358490566037, + "acc_stderr,none": 0.029582245128384355 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.6242774566473989, + "acc_stderr,none": 0.03692820767264865 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.36, + "acc_stderr,none": 0.048241815132442176 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.6322869955156951, + "acc_stderr,none": 0.03236198350928281 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.7475728155339806, + "acc_stderr,none": 0.04301250399690879 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.7905982905982906, + "acc_stderr,none": 0.026655699653922706 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.6, + "acc_stderr,none": 0.0492365963917331 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.6756066411238825, + "acc_stderr,none": 0.016740929047162616 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.6339869281045751, + "acc_stderr,none": 0.027582811415159628 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.4148936170212766, + "acc_stderr,none": 0.02939223658461248 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4852941176470588, + "acc_stderr,none": 0.030359697079046163 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.4819277108433735, + "acc_stderr,none": 0.03889951252827222 + }, + "mmlu_social_sciences": { + "acc,none": 0.6548586285342867, + "acc_stderr,none": 0.008439086883322488, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.41228070175438597, + "acc_stderr,none": 0.046306532033665936 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.7222222222222222, + "acc_stderr,none": 0.03191178226713548 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.7150259067357513, + "acc_stderr,none": 0.032577140777096614 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.6153846153846154, + "acc_stderr,none": 0.02466674491518715 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.6512605042016807, + "acc_stderr,none": 0.03095663632856658 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.7669724770642202, + "acc_stderr,none": 0.018125669180861538 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.6641221374045801, + "acc_stderr,none": 0.04142313771996665 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.5555555555555556, + "acc_stderr,none": 0.020102583895887222 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6545454545454545, + "acc_stderr,none": 0.045546196175410524 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.6326530612244898, + "acc_stderr,none": 0.030862144921087586 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.7164179104477612, + "acc_stderr,none": 0.031871875379197945 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.76, + "acc_stderr,none": 0.04292346959909278 + }, + "mmlu_stem": { + "acc,none": 0.49762131303520457, + "acc_stderr,none": 0.008675060585438295, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.27, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.45925925925925926, + "acc_stderr,none": 0.04304979692464244 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.6052631578947368, + "acc_stderr,none": 0.039777499346220775 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.6458333333333334, + "acc_stderr,none": 0.03999411135753542 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.43, + "acc_stderr,none": 0.049756985195624305 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.48, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.37, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.43137254901960786, + "acc_stderr,none": 0.049280995972875316 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.73, + "acc_stderr,none": 0.04461960433384737 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.548936170212766, + "acc_stderr,none": 0.032529096196132014 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6344827586206897, + "acc_stderr,none": 0.04013124195424389 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.46825396825396826, + "acc_stderr,none": 0.02569935283213174 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.6741935483870968, + "acc_stderr,none": 0.026662010578567125 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.46798029556650245, + "acc_stderr,none": 0.03510766597959214 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.34444444444444444, + "acc_stderr,none": 0.02897264888484431 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.3576158940397351, + "acc_stderr,none": 0.0391345343117726 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4861111111111111, + "acc_stderr,none": 0.03408655867977753 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.3392857142857143, + "acc_stderr,none": 0.04493949068613541 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.5547642785927931, + "acc_stderr,none": 0.004012513195810285, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.4913921360255048, + "acc_stderr,none": 0.006879051905115341, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.6095912455745092, + "acc_stderr,none": 0.008544041576531047, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.6548586285342867, + "acc_stderr,none": 0.008439086883322488, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.49762131303520457, + "acc_stderr,none": 0.008675060585438295, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 36, + 51, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756226497.0820615, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10357862.81134909, + "end_time": 10359156.106667291, + "total_evaluation_time_seconds": "1293.2953182011843" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T10-59-41.024347.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T10-59-41.024347.json new file mode 100644 index 0000000000000000000000000000000000000000..b01efb49edd0417f9a37948cbab27f93aed4073a --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T10-59-41.024347.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7415669205658324, + "acc_stderr,none": 0.010213971636773348, + "acc_norm,none": 0.7464635473340587, + "acc_norm_stderr,none": 0.010150090834551817 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756119428.7735612, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10250793.352108711, + "end_time": 10251004.462565055, + "total_evaluation_time_seconds": "211.11045634374022" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T19-43-21.008421.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T19-43-21.008421.json new file mode 100644 index 0000000000000000000000000000000000000000..a8271cce2611e178de6b1fac0284648ba7b44e8c --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T19-43-21.008421.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7415669205658324, + "acc_stderr,none": 0.010213971636773348, + "acc_norm,none": 0.7464635473340587, + "acc_norm_stderr,none": 0.010150090834551817 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756150951.464924, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10282314.340210706, + "end_time": 10282424.446657803, + "total_evaluation_time_seconds": "110.10644709691405" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T18-00-21.832934.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T18-00-21.832934.json new file mode 100644 index 0000000000000000000000000000000000000000..3a6a19051e1677d4f544c7338bff89ffcfe8b859 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T18-00-21.832934.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7415669205658324, + "acc_stderr,none": 0.010213971636773348, + "acc_norm,none": 0.7464635473340587, + "acc_norm_stderr,none": 0.010150090834551817 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756231170.7105067, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10362535.449745093, + "end_time": 10362645.271010853, + "total_evaluation_time_seconds": "109.82126576080918" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T11-21-55.426143.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T11-21-55.426143.json new file mode 100644 index 0000000000000000000000000000000000000000..5a63bc0dcd678242d140cf5d458261cb8ed72335 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T11-21-55.426143.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.3354324565314311, + "exact_match_stderr,remove_whitespace": 0.0035247196231818468 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756119682.5519695, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10251047.70011653, + "end_time": 10252338.864046091, + "total_evaluation_time_seconds": "1291.163929561153" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T20-05-45.201810.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T20-05-45.201810.json new file mode 100644 index 0000000000000000000000000000000000000000..4db5ff62b8ec3bfadd673a1f48f487eb62e0ade5 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T20-05-45.201810.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.3354324565314311, + "exact_match_stderr,remove_whitespace": 0.0035247196231818468 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756151105.6363664, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10282468.28029194, + "end_time": 10283768.640366916, + "total_evaluation_time_seconds": "1300.3600749764591" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T18-22-29.903081.json b/results/qwen2.5-1.5b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T18-22-29.903081.json new file mode 100644 index 0000000000000000000000000000000000000000..ddaab7a997b78a4dc3ae3b6d042aefe647d6a49f --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T18-22-29.903081.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.3354324565314311, + "exact_match_stderr,remove_whitespace": 0.0035247196231818468 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "qwen2.5-1.5b-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen2.5-1.5b-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756231323.472232, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10362688.587081311, + "end_time": 10363973.341262477, + "total_evaluation_time_seconds": "1284.7541811652482" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m/hellaswag-0/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T08-22-52.924484.json b/results/qwen2.5-1.5b-q3_k_m/hellaswag-0/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T08-22-52.924484.json new file mode 100644 index 0000000000000000000000000000000000000000..69032be83165a2ec95faddcdbe4d7cc541264c0e --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m/hellaswag-0/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T08-22-52.924484.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.4794861581358295, + "acc_stderr,none": 0.004985580065946382, + "acc_norm,none": 0.6505676160127465, + "acc_norm_stderr,none": 0.0047581629679972854 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen2.5-1.5B-GGUF,gguf_file=qwen2.5-1.5b-q3_k_m.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b5b418f9ac2f3c2836e37f6c0637cbd0049f183e", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756196283.2100413, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen2.5-1.5B-GGUF", + "model_name_sanitized": "skymizer__Qwen2.5-1.5B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10327648.56469443, + "end_time": 10327996.362957267, + "total_evaluation_time_seconds": "347.7982628364116" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m/hellaswag-10/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T09-11-42.084552.json b/results/qwen2.5-1.5b-q3_k_m/hellaswag-10/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T09-11-42.084552.json new file mode 100644 index 0000000000000000000000000000000000000000..f9793e0d7c56f6e4c3cc8a51b6ba574d5f1ade7e --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m/hellaswag-10/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T09-11-42.084552.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.47460665206134234, + "acc_stderr,none": 0.0049833422137764375, + "acc_norm,none": 0.6474805815574587, + "acc_norm_stderr,none": 0.004767782256040978 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen2.5-1.5B-GGUF,gguf_file=qwen2.5-1.5b-q3_k_m.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b5b418f9ac2f3c2836e37f6c0637cbd0049f183e", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756196674.4784877, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen2.5-1.5B-GGUF", + "model_name_sanitized": "skymizer__Qwen2.5-1.5B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10328039.14289001, + "end_time": 10330925.523238152, + "total_evaluation_time_seconds": "2886.3803481422365" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m/mmlu-5/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T08-16-21.965989.json b/results/qwen2.5-1.5b-q3_k_m/mmlu-5/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T08-16-21.965989.json new file mode 100644 index 0000000000000000000000000000000000000000..c1528dbfd7d18a9b1afb0708f6733ea71640da10 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m/mmlu-5/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T08-16-21.965989.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.5510611024070645, + "acc_stderr,none": 0.004023219448224181, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.4901168969181722, + "acc_stderr,none": 0.006910363415451104, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.4444444444444444, + "acc_stderr,none": 0.044444444444444446 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.696969696969697, + "acc_stderr,none": 0.035886248000917116 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.6764705882352942, + "acc_stderr,none": 0.0328347205610856 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.7046413502109705, + "acc_stderr,none": 0.029696338713422813 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.6776859504132231, + "acc_stderr,none": 0.04266416363352168 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.6851851851851852, + "acc_stderr,none": 0.0448993107359131 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.6993865030674846, + "acc_stderr,none": 0.036025113188067656 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.5751445086705202, + "acc_stderr,none": 0.0266133508402617 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.2569832402234637, + "acc_stderr,none": 0.014614465821966405 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.6270096463022508, + "acc_stderr,none": 0.02746661021314009 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.5895061728395061, + "acc_stderr,none": 0.027371350925124785 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.4048239895697523, + "acc_stderr,none": 0.012536743830953968 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.7251461988304093, + "acc_stderr,none": 0.03424042924691578 + }, + "mmlu_other": { + "acc,none": 0.607981976182813, + "acc_stderr,none": 0.008538514563818182, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.67, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.630188679245283, + "acc_stderr,none": 0.029711421880108006 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.6184971098265896, + "acc_stderr,none": 0.03703851193099517 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.34, + "acc_stderr,none": 0.04760952285695233 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.6278026905829597, + "acc_stderr,none": 0.03244305283008735 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.7475728155339806, + "acc_stderr,none": 0.04301250399690879 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.7991452991452992, + "acc_stderr,none": 0.026246772946890443 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.6794380587484036, + "acc_stderr,none": 0.016688893310803803 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.6274509803921569, + "acc_stderr,none": 0.02768418188330291 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.42907801418439717, + "acc_stderr,none": 0.029525914302558586 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4632352941176471, + "acc_stderr,none": 0.03029061918048574 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.4879518072289157, + "acc_stderr,none": 0.038913644958358196 + }, + "mmlu_social_sciences": { + "acc,none": 0.6467338316542086, + "acc_stderr,none": 0.00849417746178364, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.41228070175438597, + "acc_stderr,none": 0.046306532033665936 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.7121212121212122, + "acc_stderr,none": 0.03225883512300997 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.6994818652849741, + "acc_stderr,none": 0.033088185944157515 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.5948717948717949, + "acc_stderr,none": 0.024890471769938145 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.6470588235294118, + "acc_stderr,none": 0.03104194130405932 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.7559633027522936, + "acc_stderr,none": 0.018415286351416447 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.6412213740458015, + "acc_stderr,none": 0.04206739313864908 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.5588235294117647, + "acc_stderr,none": 0.0200873620767029 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6090909090909091, + "acc_stderr,none": 0.046737523336702363 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.6326530612244898, + "acc_stderr,none": 0.030862144921087586 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.7213930348258707, + "acc_stderr,none": 0.031700561834973114 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.76, + "acc_stderr,none": 0.04292346959909278 + }, + "mmlu_stem": { + "acc,none": 0.49254678084364095, + "acc_stderr,none": 0.008671986161985265, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.26, + "acc_stderr,none": 0.0440844002276808 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.45925925925925926, + "acc_stderr,none": 0.04304979692464244 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.5986842105263158, + "acc_stderr,none": 0.03988903703336284 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.6111111111111112, + "acc_stderr,none": 0.040766632539185714 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.4, + "acc_stderr,none": 0.0492365963917331 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.48, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.35, + "acc_stderr,none": 0.04793724854411023 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.35294117647058826, + "acc_stderr,none": 0.04755129616062946 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.72, + "acc_stderr,none": 0.045126085985421296 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.5319148936170213, + "acc_stderr,none": 0.03261936918467376 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6413793103448275, + "acc_stderr,none": 0.03996629574876715 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.47354497354497355, + "acc_stderr,none": 0.025715239811346824 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.6741935483870968, + "acc_stderr,none": 0.026662010578567125 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.47783251231527096, + "acc_stderr,none": 0.03514528562175008 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.6, + "acc_stderr,none": 0.0492365963917331 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.35185185185185186, + "acc_stderr,none": 0.02911661760608302 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.3576158940397351, + "acc_stderr,none": 0.0391345343117726 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.48148148148148145, + "acc_stderr,none": 0.03407632093854052 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.3482142857142857, + "acc_stderr,none": 0.04521829902833581 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.5510611024070645, + "acc_stderr,none": 0.004023219448224181, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.4901168969181722, + "acc_stderr,none": 0.006910363415451104, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.607981976182813, + "acc_stderr,none": 0.008538514563818182, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.6467338316542086, + "acc_stderr,none": 0.00849417746178364, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.49254678084364095, + "acc_stderr,none": 0.008671986161985265, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen2.5-1.5B-GGUF,gguf_file=qwen2.5-1.5b-q3_k_m.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b5b418f9ac2f3c2836e37f6c0637cbd0049f183e", + "batch_size": "auto:4", + "batch_sizes": [ + 11, + 36, + 51, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756194931.9798706, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen2.5-1.5B-GGUF", + "model_name_sanitized": "skymizer__Qwen2.5-1.5B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10326295.040783761, + "end_time": 10327605.404048383, + "total_evaluation_time_seconds": "1310.3632646221668" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m/piqa-0/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T09-14-25.230848.json b/results/qwen2.5-1.5b-q3_k_m/piqa-0/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T09-14-25.230848.json new file mode 100644 index 0000000000000000000000000000000000000000..567bb27cf91af0ef4f6d9e38ae685750424b4af2 --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m/piqa-0/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T09-14-25.230848.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.7399347116430903, + "acc_stderr,none": 0.010234893249061136, + "acc_norm,none": 0.7480957562568009, + "acc_norm_stderr,none": 0.010128421335088757 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen2.5-1.5B-GGUF,gguf_file=qwen2.5-1.5b-q3_k_m.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b5b418f9ac2f3c2836e37f6c0637cbd0049f183e", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756199608.4986675, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen2.5-1.5B-GGUF", + "model_name_sanitized": "skymizer__Qwen2.5-1.5B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10330970.500595247, + "end_time": 10331088.669548504, + "total_evaluation_time_seconds": "118.16895325668156" +} \ No newline at end of file diff --git a/results/qwen2.5-1.5b-q3_k_m/triviaqa-5/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T09-36-34.590983.json b/results/qwen2.5-1.5b-q3_k_m/triviaqa-5/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T09-36-34.590983.json new file mode 100644 index 0000000000000000000000000000000000000000..5c0e65f0292f7401e34a87a3476a9200dfe7f4aa --- /dev/null +++ b/results/qwen2.5-1.5b-q3_k_m/triviaqa-5/skymizer__Qwen2.5-1.5B-GGUF/results_2025-08-26T09-36-34.590983.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.33660276415514934, + "exact_match_stderr,remove_whitespace": 0.003527752751543392 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "skymizer/Qwen2.5-1.5B-GGUF", + "gguf_file": "qwen2.5-1.5b-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen2.5-1.5B" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen2.5-1.5B-GGUF,gguf_file=qwen2.5-1.5b-q3_k_m.gguf,tokenizer=Qwen/Qwen2.5-1.5B", + "model_num_parameters": 1543714304, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "b5b418f9ac2f3c2836e37f6c0637cbd0049f183e", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756199769.2908065, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 131072, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen2.5-1.5B-GGUF", + "model_name_sanitized": "skymizer__Qwen2.5-1.5B-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10331132.6943363, + "end_time": 10332418.029650584, + "total_evaluation_time_seconds": "1285.3353142831475" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T07-10-08.326899.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T07-10-08.326899.json new file mode 100644 index 0000000000000000000000000000000000000000..9df9f8ddd2e0439360c0b58fe2b1e74254a44fec --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T07-10-08.326899.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.37024497112129057, + "acc_stderr,none": 0.004818833521340235, + "acc_norm,none": 0.47679745070703045, + "acc_norm_stderr,none": 0.004984405935541033 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756105636.8690147, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10237002.579090573, + "end_time": 10237231.764777293, + "total_evaluation_time_seconds": "229.18568672053516" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T15-54-37.795042.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T15-54-37.795042.json new file mode 100644 index 0000000000000000000000000000000000000000..1eb25af85e771411896dac7f5d1f1b6eb4a86e65 --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-25T15-54-37.795042.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.37024497112129057, + "acc_stderr,none": 0.004818833521340235, + "acc_norm,none": 0.47679745070703045, + "acc_norm_stderr,none": 0.004984405935541033 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756137113.3362834, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10268477.363642804, + "end_time": 10268701.233468557, + "total_evaluation_time_seconds": "223.86982575245202" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T14-12-09.552617.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T14-12-09.552617.json new file mode 100644 index 0000000000000000000000000000000000000000..4a58d187af39a26ed864ad7112a662a870c6edd0 --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-0/.__models__/results_2025-08-26T14-12-09.552617.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.37024497112129057, + "acc_stderr,none": 0.004818833521340235, + "acc_norm,none": 0.47679745070703045, + "acc_norm_stderr,none": 0.004984405935541033 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756217367.6014361, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10348732.967941888, + "end_time": 10348952.990674097, + "total_evaluation_time_seconds": "220.02273220941424" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T07-39-01.090440.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T07-39-01.090440.json new file mode 100644 index 0000000000000000000000000000000000000000..a2805e199e2cb825af999bf42bd6c2eaed68819d --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T07-39-01.090440.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.3676558454491137, + "acc_stderr,none": 0.0048118159593889594, + "acc_norm,none": 0.47460665206134234, + "acc_norm_stderr,none": 0.0049833422137764375 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756105914.514641, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10237277.79024179, + "end_time": 10238964.527726904, + "total_evaluation_time_seconds": "1686.737485114485" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T16-23-45.179101.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T16-23-45.179101.json new file mode 100644 index 0000000000000000000000000000000000000000..f5076af6ad7a32327fbac43efa00b1084071cb50 --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-25T16-23-45.179101.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.3676558454491137, + "acc_stderr,none": 0.0048118159593889594, + "acc_norm,none": 0.47460665206134234, + "acc_norm_stderr,none": 0.0049833422137764375 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756137410.2098243, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10268754.765098976, + "end_time": 10270448.617354188, + "total_evaluation_time_seconds": "1693.852255212143" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T14-40-46.776650.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T14-40-46.776650.json new file mode 100644 index 0000000000000000000000000000000000000000..cab89198ded927f591b1e84b7c9cd98b318e6865 --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/hellaswag-10/.__models__/results_2025-08-26T14-40-46.776650.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.3676558454491137, + "acc_stderr,none": 0.0048118159593889594, + "acc_norm,none": 0.47460665206134234, + "acc_norm_stderr,none": 0.0049833422137764375 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756217630.4257221, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10348995.917466177, + "end_time": 10350670.21377758, + "total_evaluation_time_seconds": "1674.2963114026934" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T07-05-34.679595.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T07-05-34.679595.json new file mode 100644 index 0000000000000000000000000000000000000000..bd6c82d63807ab45d0c41b35b0089ced6043ddda --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T07-05-34.679595.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.4487252528129896, + "acc_stderr,none": 0.004107937147676099, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.39617428267800214, + "acc_stderr,none": 0.006959189662557386, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.30952380952380953, + "acc_stderr,none": 0.04134913018303316 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.5696969696969697, + "acc_stderr,none": 0.03866225962879074 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.45098039215686275, + "acc_stderr,none": 0.034924061041636124 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.5443037974683544, + "acc_stderr,none": 0.03241920684693335 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.6776859504132231, + "acc_stderr,none": 0.04266416363352168 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.46296296296296297, + "acc_stderr,none": 0.04820403072760628 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.43558282208588955, + "acc_stderr,none": 0.03895632464138937 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.49421965317919075, + "acc_stderr,none": 0.026917296179149057 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.2569832402234637, + "acc_stderr,none": 0.014614465821966405 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.4983922829581994, + "acc_stderr,none": 0.02839794490780658 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.47530864197530864, + "acc_stderr,none": 0.027786800931427418 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.3324641460234681, + "acc_stderr,none": 0.01203202233226046 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.5087719298245614, + "acc_stderr,none": 0.038342347441649854 + }, + "mmlu_other": { + "acc,none": 0.49308014161570646, + "acc_stderr,none": 0.008833896183820066, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.48, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.5056603773584906, + "acc_stderr,none": 0.030770900763851323 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.4682080924855491, + "acc_stderr,none": 0.03804749744364767 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.29, + "acc_stderr,none": 0.045604802157206865 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.452914798206278, + "acc_stderr,none": 0.03340867501923327 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.5339805825242718, + "acc_stderr,none": 0.049392914472734785 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.7222222222222222, + "acc_stderr,none": 0.029343114798094483 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.63, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.5146871008939975, + "acc_stderr,none": 0.017872248024429188 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.5228758169934641, + "acc_stderr,none": 0.028599936776089786 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.36879432624113473, + "acc_stderr,none": 0.028782227561347212 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4411764705882353, + "acc_stderr,none": 0.030161911930767053 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.39156626506024095, + "acc_stderr,none": 0.037998574544796396 + }, + "mmlu_social_sciences": { + "acc,none": 0.49951251218719533, + "acc_stderr,none": 0.008909068747568758, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.04434600701584929 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.5404040404040404, + "acc_stderr,none": 0.03550702465131341 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.5284974093264249, + "acc_stderr,none": 0.03602573571288441 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.4025641025641026, + "acc_stderr,none": 0.02486499515976774 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.47478991596638653, + "acc_stderr,none": 0.03243718055137406 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.5963302752293578, + "acc_stderr,none": 0.021035704856575025 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.5343511450381679, + "acc_stderr,none": 0.043749285605997376 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.41830065359477125, + "acc_stderr,none": 0.01995597514583561 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.5181818181818182, + "acc_stderr,none": 0.04785964010794916 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.5224489795918368, + "acc_stderr,none": 0.03197694118713664 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.6069651741293532, + "acc_stderr,none": 0.03453682466031563 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.62, + "acc_stderr,none": 0.04878317312145634 + }, + "mmlu_stem": { + "acc,none": 0.43387250237868696, + "acc_stderr,none": 0.00868889021224809, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.4444444444444444, + "acc_stderr,none": 0.04292596718256977 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.4473684210526316, + "acc_stderr,none": 0.040463368839782535 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.5555555555555556, + "acc_stderr,none": 0.04155319955593143 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.39, + "acc_stderr,none": 0.04902071300001973 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.41, + "acc_stderr,none": 0.04943110704237104 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.34, + "acc_stderr,none": 0.04760952285695233 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.04690650298201946 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.43829787234042555, + "acc_stderr,none": 0.03243618636108098 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.5586206896551724, + "acc_stderr,none": 0.041379310344827586 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.35714285714285715, + "acc_stderr,none": 0.024677862841332786 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.5774193548387097, + "acc_stderr,none": 0.02810096472427264 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.5024630541871922, + "acc_stderr,none": 0.035179450386910595 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.57, + "acc_stderr,none": 0.049756985195624305 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.2962962962962963, + "acc_stderr,none": 0.027840811495871948 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.3443708609271523, + "acc_stderr,none": 0.038796870240733264 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4074074074074074, + "acc_stderr,none": 0.03350991604696042 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.41964285714285715, + "acc_stderr,none": 0.04684099321077107 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.4487252528129896, + "acc_stderr,none": 0.004107937147676099, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.39617428267800214, + "acc_stderr,none": 0.006959189662557386, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.49308014161570646, + "acc_stderr,none": 0.008833896183820066, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.49951251218719533, + "acc_stderr,none": 0.008909068747568758, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.43387250237868696, + "acc_stderr,none": 0.00868889021224809, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 13, + 45, + 51, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756104532.0902948, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10235884.381060034, + "end_time": 10236958.117728332, + "total_evaluation_time_seconds": "1073.736668298021" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T15-50-09.213747.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T15-50-09.213747.json new file mode 100644 index 0000000000000000000000000000000000000000..d60fe25ed105c32cbbae0b21d71a36e8c65b90f0 --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-25T15-50-09.213747.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.4487252528129896, + "acc_stderr,none": 0.004107937147676099, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.39617428267800214, + "acc_stderr,none": 0.006959189662557386, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.30952380952380953, + "acc_stderr,none": 0.04134913018303316 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.5696969696969697, + "acc_stderr,none": 0.03866225962879074 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.45098039215686275, + "acc_stderr,none": 0.034924061041636124 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.5443037974683544, + "acc_stderr,none": 0.03241920684693335 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.6776859504132231, + "acc_stderr,none": 0.04266416363352168 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.46296296296296297, + "acc_stderr,none": 0.04820403072760628 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.43558282208588955, + "acc_stderr,none": 0.03895632464138937 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.49421965317919075, + "acc_stderr,none": 0.026917296179149057 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.2569832402234637, + "acc_stderr,none": 0.014614465821966405 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.4983922829581994, + "acc_stderr,none": 0.02839794490780658 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.47530864197530864, + "acc_stderr,none": 0.027786800931427418 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.3324641460234681, + "acc_stderr,none": 0.01203202233226046 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.5087719298245614, + "acc_stderr,none": 0.038342347441649854 + }, + "mmlu_other": { + "acc,none": 0.49308014161570646, + "acc_stderr,none": 0.008833896183820066, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.48, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.5056603773584906, + "acc_stderr,none": 0.030770900763851323 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.4682080924855491, + "acc_stderr,none": 0.03804749744364767 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.29, + "acc_stderr,none": 0.045604802157206865 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.452914798206278, + "acc_stderr,none": 0.03340867501923327 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.5339805825242718, + "acc_stderr,none": 0.049392914472734785 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.7222222222222222, + "acc_stderr,none": 0.029343114798094483 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.63, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.5146871008939975, + "acc_stderr,none": 0.017872248024429188 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.5228758169934641, + "acc_stderr,none": 0.028599936776089786 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.36879432624113473, + "acc_stderr,none": 0.028782227561347212 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4411764705882353, + "acc_stderr,none": 0.030161911930767053 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.39156626506024095, + "acc_stderr,none": 0.037998574544796396 + }, + "mmlu_social_sciences": { + "acc,none": 0.49951251218719533, + "acc_stderr,none": 0.008909068747568758, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.04434600701584929 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.5404040404040404, + "acc_stderr,none": 0.03550702465131341 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.5284974093264249, + "acc_stderr,none": 0.03602573571288441 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.4025641025641026, + "acc_stderr,none": 0.02486499515976774 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.47478991596638653, + "acc_stderr,none": 0.03243718055137406 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.5963302752293578, + "acc_stderr,none": 0.021035704856575025 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.5343511450381679, + "acc_stderr,none": 0.043749285605997376 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.41830065359477125, + "acc_stderr,none": 0.01995597514583561 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.5181818181818182, + "acc_stderr,none": 0.04785964010794916 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.5224489795918368, + "acc_stderr,none": 0.03197694118713664 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.6069651741293532, + "acc_stderr,none": 0.03453682466031563 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.62, + "acc_stderr,none": 0.04878317312145634 + }, + "mmlu_stem": { + "acc,none": 0.43387250237868696, + "acc_stderr,none": 0.00868889021224809, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.4444444444444444, + "acc_stderr,none": 0.04292596718256977 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.4473684210526316, + "acc_stderr,none": 0.040463368839782535 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.5555555555555556, + "acc_stderr,none": 0.04155319955593143 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.39, + "acc_stderr,none": 0.04902071300001973 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.41, + "acc_stderr,none": 0.04943110704237104 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.34, + "acc_stderr,none": 0.04760952285695233 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.04690650298201946 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.43829787234042555, + "acc_stderr,none": 0.03243618636108098 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.5586206896551724, + "acc_stderr,none": 0.041379310344827586 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.35714285714285715, + "acc_stderr,none": 0.024677862841332786 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.5774193548387097, + "acc_stderr,none": 0.02810096472427264 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.5024630541871922, + "acc_stderr,none": 0.035179450386910595 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.57, + "acc_stderr,none": 0.049756985195624305 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.2962962962962963, + "acc_stderr,none": 0.027840811495871948 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.3443708609271523, + "acc_stderr,none": 0.038796870240733264 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4074074074074074, + "acc_stderr,none": 0.03350991604696042 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.41964285714285715, + "acc_stderr,none": 0.04684099321077107 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.4487252528129896, + "acc_stderr,none": 0.004107937147676099, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.39617428267800214, + "acc_stderr,none": 0.006959189662557386, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.49308014161570646, + "acc_stderr,none": 0.008833896183820066, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.49951251218719533, + "acc_stderr,none": 0.008909068747568758, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.43387250237868696, + "acc_stderr,none": 0.00868889021224809, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 13, + 45, + 51, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756136205.884102, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10267570.325025165, + "end_time": 10268432.651809271, + "total_evaluation_time_seconds": "862.3267841059715" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T14-07-45.954104.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T14-07-45.954104.json new file mode 100644 index 0000000000000000000000000000000000000000..87733196baae662b2f21af48489b8414e11c427d --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/mmlu-5/.__models__/results_2025-08-26T14-07-45.954104.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.4487252528129896, + "acc_stderr,none": 0.004107937147676099, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.39617428267800214, + "acc_stderr,none": 0.006959189662557386, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.30952380952380953, + "acc_stderr,none": 0.04134913018303316 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.5696969696969697, + "acc_stderr,none": 0.03866225962879074 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.45098039215686275, + "acc_stderr,none": 0.034924061041636124 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.5443037974683544, + "acc_stderr,none": 0.03241920684693335 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.6776859504132231, + "acc_stderr,none": 0.04266416363352168 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.46296296296296297, + "acc_stderr,none": 0.04820403072760628 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.43558282208588955, + "acc_stderr,none": 0.03895632464138937 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.49421965317919075, + "acc_stderr,none": 0.026917296179149057 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.2569832402234637, + "acc_stderr,none": 0.014614465821966405 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.4983922829581994, + "acc_stderr,none": 0.02839794490780658 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.47530864197530864, + "acc_stderr,none": 0.027786800931427418 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.3324641460234681, + "acc_stderr,none": 0.01203202233226046 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.5087719298245614, + "acc_stderr,none": 0.038342347441649854 + }, + "mmlu_other": { + "acc,none": 0.49308014161570646, + "acc_stderr,none": 0.008833896183820066, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.48, + "acc_stderr,none": 0.05021167315686783 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.5056603773584906, + "acc_stderr,none": 0.030770900763851323 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.4682080924855491, + "acc_stderr,none": 0.03804749744364767 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.29, + "acc_stderr,none": 0.045604802157206865 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.452914798206278, + "acc_stderr,none": 0.03340867501923327 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.5339805825242718, + "acc_stderr,none": 0.049392914472734785 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.7222222222222222, + "acc_stderr,none": 0.029343114798094483 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.63, + "acc_stderr,none": 0.048523658709390974 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.5146871008939975, + "acc_stderr,none": 0.017872248024429188 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.5228758169934641, + "acc_stderr,none": 0.028599936776089786 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.36879432624113473, + "acc_stderr,none": 0.028782227561347212 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4411764705882353, + "acc_stderr,none": 0.030161911930767053 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.39156626506024095, + "acc_stderr,none": 0.037998574544796396 + }, + "mmlu_social_sciences": { + "acc,none": 0.49951251218719533, + "acc_stderr,none": 0.008909068747568758, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.04434600701584929 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.5404040404040404, + "acc_stderr,none": 0.03550702465131341 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.5284974093264249, + "acc_stderr,none": 0.03602573571288441 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.4025641025641026, + "acc_stderr,none": 0.02486499515976774 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.47478991596638653, + "acc_stderr,none": 0.03243718055137406 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.5963302752293578, + "acc_stderr,none": 0.021035704856575025 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.5343511450381679, + "acc_stderr,none": 0.043749285605997376 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.41830065359477125, + "acc_stderr,none": 0.01995597514583561 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.5181818181818182, + "acc_stderr,none": 0.04785964010794916 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.5224489795918368, + "acc_stderr,none": 0.03197694118713664 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.6069651741293532, + "acc_stderr,none": 0.03453682466031563 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.62, + "acc_stderr,none": 0.04878317312145634 + }, + "mmlu_stem": { + "acc,none": 0.43387250237868696, + "acc_stderr,none": 0.00868889021224809, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.4444444444444444, + "acc_stderr,none": 0.04292596718256977 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.4473684210526316, + "acc_stderr,none": 0.040463368839782535 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.5555555555555556, + "acc_stderr,none": 0.04155319955593143 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.39, + "acc_stderr,none": 0.04902071300001973 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.41, + "acc_stderr,none": 0.04943110704237104 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.34, + "acc_stderr,none": 0.04760952285695233 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.3333333333333333, + "acc_stderr,none": 0.04690650298201946 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.43829787234042555, + "acc_stderr,none": 0.03243618636108098 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.5586206896551724, + "acc_stderr,none": 0.041379310344827586 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.35714285714285715, + "acc_stderr,none": 0.024677862841332786 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.5774193548387097, + "acc_stderr,none": 0.02810096472427264 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.5024630541871922, + "acc_stderr,none": 0.035179450386910595 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.57, + "acc_stderr,none": 0.049756985195624305 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.2962962962962963, + "acc_stderr,none": 0.027840811495871948 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.3443708609271523, + "acc_stderr,none": 0.038796870240733264 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4074074074074074, + "acc_stderr,none": 0.03350991604696042 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.41964285714285715, + "acc_stderr,none": 0.04684099321077107 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.4487252528129896, + "acc_stderr,none": 0.004107937147676099, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.39617428267800214, + "acc_stderr,none": 0.006959189662557386, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.49308014161570646, + "acc_stderr,none": 0.008833896183820066, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.49951251218719533, + "acc_stderr,none": 0.008909068747568758, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.43387250237868696, + "acc_stderr,none": 0.00868889021224809, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 13, + 45, + 51, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756216428.3241298, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10347658.30313968, + "end_time": 10348689.392188508, + "total_evaluation_time_seconds": "1031.0890488289297" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T07-41-27.257312.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T07-41-27.257312.json new file mode 100644 index 0000000000000000000000000000000000000000..90aa88e2cfc28efb7530042f2ff4642cb917b0ab --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T07-41-27.257312.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.6670293797606094, + "acc_stderr,none": 0.010995648822619247, + "acc_norm,none": 0.676278563656148, + "acc_norm_stderr,none": 0.01091676501070871 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756107643.2224786, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10239008.0114861, + "end_time": 10239110.695829116, + "total_evaluation_time_seconds": "102.68434301577508" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T16-26-06.052938.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T16-26-06.052938.json new file mode 100644 index 0000000000000000000000000000000000000000..d42cb7168963ae56f85249d26b847563ea08431a --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-25T16-26-06.052938.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.6670293797606094, + "acc_stderr,none": 0.010995648822619247, + "acc_norm,none": 0.676278563656148, + "acc_norm_stderr,none": 0.01091676501070871 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756139127.821693, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10270492.295024317, + "end_time": 10270589.491399057, + "total_evaluation_time_seconds": "97.19637474045157" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T14-43-06.736507.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T14-43-06.736507.json new file mode 100644 index 0000000000000000000000000000000000000000..64fcb166442c59ae45561a7ca1c1558f7a4d0611 --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/piqa-0/.__models__/results_2025-08-26T14-43-06.736507.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.6670293797606094, + "acc_stderr,none": 0.010995648822619247, + "acc_norm,none": 0.676278563656148, + "acc_norm_stderr,none": 0.01091676501070871 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756219348.0897496, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10350713.484168192, + "end_time": 10350810.17502791, + "total_evaluation_time_seconds": "96.69085971824825" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T08-12-47.124048.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T08-12-47.124048.json new file mode 100644 index 0000000000000000000000000000000000000000..f6d6d3758bcf83cd5680255e1704ca7b7b705d8a --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T08-12-47.124048.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.1107333927775301, + "exact_match_stderr,remove_whitespace": 0.0023426519342683205 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756107789.5774505, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10239154.204092294, + "end_time": 10240990.562457988, + "total_evaluation_time_seconds": "1836.358365694061" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T16-57-48.267396.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T16-57-48.267396.json new file mode 100644 index 0000000000000000000000000000000000000000..0a98f2fed8d41fe1eca43173eb882f0b46079d9a --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-25T16-57-48.267396.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.1107333927775301, + "exact_match_stderr,remove_whitespace": 0.0023426519342683205 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756139268.3757875, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10270632.893698309, + "end_time": 10272491.705780433, + "total_evaluation_time_seconds": "1858.812082124874" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T15-15-30.843674.json b/results/qwen3-0.6b-base-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T15-15-30.843674.json new file mode 100644 index 0000000000000000000000000000000000000000..fbc5ddd61aa26e08676c8f959e8267c7436cd1a4 --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m-dc-b10/triviaqa-5/.__models__/results_2025-08-26T15-15-30.843674.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.1107333927775301, + "exact_match_stderr,remove_whitespace": 0.0023426519342683205 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "./models/", + "gguf_file": "qwen3-0.6b-base-q3_k_m-dc-b10.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=./models/,gguf_file=qwen3-0.6b-base-q3_k_m-dc-b10.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756219489.0325265, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "./models/", + "model_name_sanitized": ".__models__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10350853.829260955, + "end_time": 10352754.282117793, + "total_evaluation_time_seconds": "1900.4528568387032" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m/hellaswag-0/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-27-35.758622.json b/results/qwen3-0.6b-base-q3_k_m/hellaswag-0/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-27-35.758622.json new file mode 100644 index 0000000000000000000000000000000000000000..aaef2ce7ee091de14fa75273fb83f0673878c10a --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m/hellaswag-0/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-27-35.758622.json @@ -0,0 +1,133 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.37054371639115713, + "acc_stderr,none": 0.004819633668832263, + "acc_norm,none": 0.47868950408285205, + "acc_norm_stderr,none": 0.004985247260303788 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 0 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen3-0.6B-Base-GGUF,gguf_file=qwen3-0.6b-base-q3_k_m.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "aba409c6d9bebf587375fe5e6b859c71b0c405ac", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756185891.1523974, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen3-0.6B-Base-GGUF", + "model_name_sanitized": "skymizer__Qwen3-0.6B-Base-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10317255.442461815, + "end_time": 10317479.197210016, + "total_evaluation_time_seconds": "223.7547482009977" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m/hellaswag-10/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-56-14.750268.json b/results/qwen3-0.6b-base-q3_k_m/hellaswag-10/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-56-14.750268.json new file mode 100644 index 0000000000000000000000000000000000000000..d857b71bc03afdacb9bb25604c974dc2db12f0dd --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m/hellaswag-10/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-56-14.750268.json @@ -0,0 +1,132 @@ +{ + "results": { + "hellaswag": { + "alias": "hellaswag", + "acc,none": 0.36675960963951404, + "acc_stderr,none": 0.004809352075009149, + "acc_norm,none": 0.47341167098187614, + "acc_norm_stderr,none": 0.004982721472407312 + } + }, + "group_subtasks": { + "hellaswag": [] + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "hellaswag": 1.0 + }, + "n-shot": { + "hellaswag": 10 + }, + "higher_is_better": { + "hellaswag": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "hellaswag": { + "original": 10042, + "effective": 10042 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen3-0.6B-Base-GGUF,gguf_file=qwen3-0.6b-base-q3_k_m.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "aba409c6d9bebf587375fe5e6b859c71b0c405ac", + "batch_size": "auto:4", + "batch_sizes": [ + 32, + 32, + 32, + 32 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756186158.4939308, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen3-0.6B-Base-GGUF", + "model_name_sanitized": "skymizer__Qwen3-0.6B-Base-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10317522.784737589, + "end_time": 10319198.188935088, + "total_evaluation_time_seconds": "1675.404197499156" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m/mmlu-5/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-23-08.094936.json b/results/qwen3-0.6b-base-q3_k_m/mmlu-5/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-23-08.094936.json new file mode 100644 index 0000000000000000000000000000000000000000..23da21319539b7e09f71cee3eedabec1f3486004 --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m/mmlu-5/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-23-08.094936.json @@ -0,0 +1,3522 @@ +{ + "results": { + "mmlu": { + "acc,none": 0.44630394530693634, + "acc_stderr,none": 0.00410894511075942, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.3951115834218916, + "acc_stderr,none": 0.0069614380832707925, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.3412698412698413, + "acc_stderr,none": 0.04240799327574919 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.5636363636363636, + "acc_stderr,none": 0.03872592983524754 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.46568627450980393, + "acc_stderr,none": 0.03501038327635894 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.5485232067510548, + "acc_stderr,none": 0.032393600173974677 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.6528925619834711, + "acc_stderr,none": 0.043457245702925314 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.4537037037037037, + "acc_stderr,none": 0.04812917324536823 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.4233128834355828, + "acc_stderr,none": 0.038818912133343826 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.48554913294797686, + "acc_stderr,none": 0.026907849856282567 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.26033519553072626, + "acc_stderr,none": 0.014676252009319574 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.5048231511254019, + "acc_stderr,none": 0.028396770444111305 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.4660493827160494, + "acc_stderr,none": 0.02775653525734759 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.3272490221642764, + "acc_stderr,none": 0.011983819806464752 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.5263157894736842, + "acc_stderr,none": 0.03829509868994731 + }, + "mmlu_other": { + "acc,none": 0.482780817508851, + "acc_stderr,none": 0.008836624709930617, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.44, + "acc_stderr,none": 0.049888765156985884 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.49056603773584906, + "acc_stderr,none": 0.03076739470780806 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.45664739884393063, + "acc_stderr,none": 0.03798106566014504 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.3, + "acc_stderr,none": 0.04605661864718382 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.4304932735426009, + "acc_stderr,none": 0.03323197302942939 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.5145631067961165, + "acc_stderr,none": 0.049486373240266356 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.7136752136752137, + "acc_stderr,none": 0.029614323690456627 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.5057471264367817, + "acc_stderr,none": 0.017878782326129335 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.5261437908496732, + "acc_stderr,none": 0.028590752958852387 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.35106382978723405, + "acc_stderr,none": 0.028473501272963837 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.4411764705882353, + "acc_stderr,none": 0.030161911930767053 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.4036144578313253, + "acc_stderr,none": 0.03819486140758399 + }, + "mmlu_social_sciences": { + "acc,none": 0.4991875203119922, + "acc_stderr,none": 0.008904059097607726, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.3508771929824561, + "acc_stderr,none": 0.044895393502707 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.5454545454545454, + "acc_stderr,none": 0.035476014940069356 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.5129533678756477, + "acc_stderr,none": 0.03607228061047752 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.40512820512820513, + "acc_stderr,none": 0.024890471769938145 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.47478991596638653, + "acc_stderr,none": 0.03243718055137406 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.6018348623853211, + "acc_stderr,none": 0.020987989422654237 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.5343511450381679, + "acc_stderr,none": 0.043749285605997376 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.4133986928104575, + "acc_stderr,none": 0.01992211568278667 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.5, + "acc_stderr,none": 0.04789131426105757 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.5102040816326531, + "acc_stderr,none": 0.032002553478937754 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.6119402985074627, + "acc_stderr,none": 0.03445789964362745 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.64, + "acc_stderr,none": 0.048241815132442176 + }, + "mmlu_stem": { + "acc,none": 0.43514113542657784, + "acc_stderr,none": 0.008696536622059903, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.31, + "acc_stderr,none": 0.04648231987117317 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.43703703703703706, + "acc_stderr,none": 0.042849586397534056 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.4342105263157895, + "acc_stderr,none": 0.04033565667848322 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.5416666666666666, + "acc_stderr,none": 0.041666666666666664 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.4, + "acc_stderr,none": 0.0492365963917331 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.39, + "acc_stderr,none": 0.04902071300001973 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.33, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.3627450980392157, + "acc_stderr,none": 0.047840607041056527 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.59, + "acc_stderr,none": 0.04943110704237104 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.4297872340425532, + "acc_stderr,none": 0.03236214467715557 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.5586206896551724, + "acc_stderr,none": 0.041379310344827586 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.3544973544973545, + "acc_stderr,none": 0.024636830602842035 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.5774193548387097, + "acc_stderr,none": 0.02810096472427264 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.5073891625615764, + "acc_stderr,none": 0.03517603540361012 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.58, + "acc_stderr,none": 0.04960449637488582 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.3037037037037037, + "acc_stderr,none": 0.02803792996911503 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.3443708609271523, + "acc_stderr,none": 0.038796870240733264 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.4351851851851852, + "acc_stderr,none": 0.03381200005643526 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.4107142857142857, + "acc_stderr,none": 0.04669510663875191 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.44630394530693634, + "acc_stderr,none": 0.00410894511075942, + "alias": "mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.3951115834218916, + "acc_stderr,none": 0.0069614380832707925, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.482780817508851, + "acc_stderr,none": 0.008836624709930617, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.4991875203119922, + "acc_stderr,none": 0.008904059097607726, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.43514113542657784, + "acc_stderr,none": 0.008696536622059903, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_prehistory", + "mmlu_philosophy", + "mmlu_moral_scenarios", + "mmlu_moral_disputes", + "mmlu_logical_fallacies", + "mmlu_jurisprudence", + "mmlu_international_law", + "mmlu_high_school_world_history", + "mmlu_high_school_us_history", + "mmlu_high_school_european_history", + "mmlu_formal_logic" + ], + "mmlu_social_sciences": [ + "mmlu_us_foreign_policy", + "mmlu_sociology", + "mmlu_security_studies", + "mmlu_public_relations", + "mmlu_professional_psychology", + "mmlu_human_sexuality", + "mmlu_high_school_psychology", + "mmlu_high_school_microeconomics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_geography", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_virology", + "mmlu_professional_medicine", + "mmlu_professional_accounting", + "mmlu_nutrition", + "mmlu_miscellaneous", + "mmlu_medical_genetics", + "mmlu_marketing", + "mmlu_management", + "mmlu_human_aging", + "mmlu_global_facts", + "mmlu_college_medicine", + "mmlu_clinical_knowledge", + "mmlu_business_ethics" + ], + "mmlu_stem": [ + "mmlu_machine_learning", + "mmlu_high_school_statistics", + "mmlu_high_school_physics", + "mmlu_high_school_mathematics", + "mmlu_high_school_computer_science", + "mmlu_high_school_chemistry", + "mmlu_high_school_biology", + "mmlu_elementary_mathematics", + "mmlu_electrical_engineering", + "mmlu_conceptual_physics", + "mmlu_computer_security", + "mmlu_college_physics", + "mmlu_college_mathematics", + "mmlu_college_computer_science", + "mmlu_college_chemistry", + "mmlu_college_biology", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_abstract_algebra" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ] + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0 + }, + "n-shot": { + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "higher_is_better": { + "mmlu": { + "acc": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + } + }, + "n-samples": { + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen3-0.6B-Base-GGUF,gguf_file=qwen3-0.6b-base-q3_k_m.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "aba409c6d9bebf587375fe5e6b859c71b0c405ac", + "batch_size": "auto:4", + "batch_sizes": [ + 13, + 45, + 51, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756184988.180219, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen3-0.6B-Base-GGUF", + "model_name_sanitized": "skymizer__Qwen3-0.6B-Base-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10316353.375672482, + "end_time": 10317211.533078732, + "total_evaluation_time_seconds": "858.1574062500149" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m/piqa-0/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-58-46.511874.json b/results/qwen3-0.6b-base-q3_k_m/piqa-0/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-58-46.511874.json new file mode 100644 index 0000000000000000000000000000000000000000..3e259fd34bb4a2e5e928e22983df86427f9e99fd --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m/piqa-0/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T05-58-46.511874.json @@ -0,0 +1,130 @@ +{ + "results": { + "piqa": { + "alias": "piqa", + "acc,none": 0.6697497279651795, + "acc_stderr,none": 0.010972947133006216, + "acc_norm,none": 0.6751904243743199, + "acc_norm_stderr,none": 0.010926296238294114 + } + }, + "group_subtasks": { + "piqa": [] + }, + "configs": { + "piqa": { + "task": "piqa", + "dataset_path": "baber/piqa", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{goal}}\nAnswer:", + "doc_to_target": "label", + "unsafe_code": false, + "doc_to_choice": "{{[sol1, sol2]}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "goal", + "metadata": { + "version": 1.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "piqa": 1.0 + }, + "n-shot": { + "piqa": 0 + }, + "higher_is_better": { + "piqa": { + "acc": true, + "acc_norm": true + } + }, + "n-samples": { + "piqa": { + "original": 1838, + "effective": 1838 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen3-0.6B-Base-GGUF,gguf_file=qwen3-0.6b-base-q3_k_m.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "aba409c6d9bebf587375fe5e6b859c71b0c405ac", + "batch_size": "auto:4", + "batch_sizes": [ + 64, + 64, + 64, + 64, + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756187884.5116744, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen3-0.6B-Base-GGUF", + "model_name_sanitized": "skymizer__Qwen3-0.6B-Base-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10319245.79722277, + "end_time": 10319349.949869724, + "total_evaluation_time_seconds": "104.15264695324004" +} \ No newline at end of file diff --git a/results/qwen3-0.6b-base-q3_k_m/triviaqa-5/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T06-28-55.444491.json b/results/qwen3-0.6b-base-q3_k_m/triviaqa-5/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T06-28-55.444491.json new file mode 100644 index 0000000000000000000000000000000000000000..5fdb21c6bd49364bb5c2ac6d4beab8f8994b5614 --- /dev/null +++ b/results/qwen3-0.6b-base-q3_k_m/triviaqa-5/skymizer__Qwen3-0.6B-Base-GGUF/results_2025-08-26T06-28-55.444491.json @@ -0,0 +1,137 @@ +{ + "results": { + "triviaqa": { + "alias": "triviaqa", + "exact_match,remove_whitespace": 0.1148573339277753, + "exact_match_stderr,remove_whitespace": 0.00238033714993835 + } + }, + "group_subtasks": { + "triviaqa": [] + }, + "configs": { + "triviaqa": { + "task": "triviaqa", + "dataset_path": "trivia_qa", + "dataset_name": "rc.nocontext", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "Question: {{question}}?\nAnswer:", + "doc_to_target": "{{answer.aliases}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n", + ".", + "," + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "remove_whitespace", + "filter": [ + { + "function": "remove_whitespace" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "skymizer/Qwen3-0.6B-Base-GGUF", + "gguf_file": "qwen3-0.6b-base-q3_k_m.gguf", + "tokenizer": "Qwen/Qwen3-0.6B-Base" + } + } + }, + "versions": { + "triviaqa": 3.0 + }, + "n-shot": { + "triviaqa": 5 + }, + "higher_is_better": { + "triviaqa": { + "exact_match": true + } + }, + "n-samples": { + "triviaqa": { + "original": 17944, + "effective": 17944 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=skymizer/Qwen3-0.6B-Base-GGUF,gguf_file=qwen3-0.6b-base-q3_k_m.gguf,tokenizer=Qwen/Qwen3-0.6B-Base", + "model_num_parameters": 596049920, + "model_dtype": "torch.float32", + "model_revision": "main", + "model_sha": "aba409c6d9bebf587375fe5e6b859c71b0c405ac", + "batch_size": "auto:4", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "v0.1.0", + "date": 1756188028.0792212, + "pretty_env_info": "'NoneType' object has no attribute 'splitlines'", + "transformers_version": "4.55.4", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_eos_token": [ + "<|endoftext|>", + "151643" + ], + "tokenizer_bos_token": [ + null, + "None" + ], + "eot_token_id": 151643, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "skymizer/Qwen3-0.6B-Base-GGUF", + "model_name_sanitized": "skymizer__Qwen3-0.6B-Base-GGUF", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 10319393.040331744, + "end_time": 10321158.883148294, + "total_evaluation_time_seconds": "1765.8428165502846" +} \ No newline at end of file