# Evaluation config: runs Qwen3-8B (HuggingFace and TurboMind backends)
# against a set of judge-scored benchmarks plus SimpleQA / Chinese SimpleQA.
from mmengine.config import read_base

from opencompass.models import (HuggingFacewithChatTemplate,
                                TurboMindModelwithChatTemplate)
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
    # Dataset lists inherited from the OpenCompass base configs. They are
    # only referenced by name (via the `*_datasets` suffix scan further down
    # in this file), hence the unused-import suppressions.
    from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
        csimpleqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
        simpleqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
        alignbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
        alpacav2_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
        arenahard_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
        compassarena_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
        followbench_llmeval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
        mtbench101_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
        wildbench_datasets  # noqa: F401, E501

    # NOTE(review): the original indentation was lost. This relative import
    # is kept inside `read_base()` on the assumption that `rjob` is a sibling
    # base config (a plain relative import would need a parent package) --
    # confirm against the repository layout.
    from ...rjob import infer, sub_eval  # noqa: F401, E501
# Concatenate every imported `*_datasets` list into one flat list.
# A generator expression is used on purpose: its loop variables stay out of
# the module namespace, so `locals()` is not modified while being iterated.
# mtbench101 and wildbench are skipped in the scan and appended explicitly,
# which places them at the end of the final list.
datasets = sum(
    (v for k, v in locals().items()
     if k.endswith('_datasets')
     and 'mtbench101' not in k
     and 'wildbench' not in k),
    [],
)
datasets += mtbench101_datasets
datasets += wildbench_datasets
# Role mapping for API-style models: one HUMAN/BOT exchange per round, with
# the BOT turn flagged as the generated one, and SYSTEM as a reserved role.
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
# Qwen3-8B served through the HuggingFace chat-template wrapper.
# Predictions pass through `extract_non_reasoning_content` before scoring.
hf_model = dict(
    type=HuggingFacewithChatTemplate,
    abbr='qwen-3-8b-hf-fullbench',
    path='Qwen/Qwen3-8B',
    max_out_len=8192,
    batch_size=8,
    run_cfg=dict(num_gpus=1),
    pred_postprocessor=dict(type=extract_non_reasoning_content),
)
# Qwen3-8B served through the TurboMind chat-template wrapper, with sampling
# disabled and thinking enabled. Predictions pass through
# `extract_non_reasoning_content` before scoring.
# NOTE(review): `max_out_len` equals `session_len` / `max_seq_len`, which
# leaves no headroom for the prompt tokens -- confirm this is intended.
tm_model = dict(
    type=TurboMindModelwithChatTemplate,
    abbr='qwen-3-8b-fullbench',
    path='Qwen/Qwen3-8B',
    engine_config=dict(session_len=32768, max_batch_size=1, tp=1),
    gen_config=dict(do_sample=False, enable_thinking=True),
    max_seq_len=32768,
    max_out_len=32768,
    batch_size=1,
    run_cfg=dict(num_gpus=1),
    pred_postprocessor=dict(type=extract_non_reasoning_content),
)
# Both backends of the same checkpoint are evaluated side by side.
models = [hf_model, tm_model]
# Judge model used to score the subjective benchmarks: the same Qwen3-8B
# checkpoint on TurboMind, with sampling and thinking both disabled and a
# 46000-token session length.
judge_models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-3-8b-judger',
        path='Qwen/Qwen3-8B',
        engine_config=dict(session_len=46000, max_batch_size=1, tp=1),
        gen_config=dict(do_sample=False, enable_thinking=False),
        max_seq_len=46000,
        max_out_len=46000,
        batch_size=1,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
]
# Attach the judge models to the imported evaluation settings. This mutates
# the imported `sub_eval` dict in place; `eval` is then bound to that same
# object.
sub_eval['partitioner']['judge_models'] = judge_models
# NOTE(review): this shadows the builtin `eval`; presumably the name is the
# config key the runner reads, so it is kept unchanged -- confirm.
eval = sub_eval
# Groups passed to the summarizer. Each entry names a group and lists its
# subsets, either as [dataset_abbr, metric] pairs or as the names of other
# groups.
# NOTE(review): no `fofo` dataset is imported in this file's `read_base()`
# block, so confirm whether the 'FoFo' group is expected to have results.
summary_groups = [
    {
        'name': 'compassarena_language',
        'subsets': [
            ['compassarena_language', '内容总结'],
        ],
    },
    {
        'name': 'compassarena_knowledge',
        'subsets': [
            ['compassarena_knowledge', '生活常识_ZH'],
        ],
    },
    {
        'name': 'compassarena_reason_v2',
        'subsets': [
            ['compassarena_reason_v2', 'reasoning'],
        ],
    },
    {
        'name': 'compassarena_math_v2',
        'subsets': [
            ['compassarena_math_v2', '高等数学_ZH'],
        ],
    },
    {
        'name': 'compassarena_creationv2_zh',
        'subsets': [
            ['compassarena_creationv2_zh', '内容扩写_ZH'],
        ],
    },
    # Roll-up of the five CompassArena groups defined above.
    {
        'name': 'CompassArena',
        'subsets': [
            'compassarena_language',
            'compassarena_knowledge',
            'compassarena_reason_v2',
            'compassarena_math_v2',
            'compassarena_creationv2_zh',
        ],
    },
    {
        'name': 'FoFo',
        'subsets': [
            ['fofo_test_prompts', 'overall'],
            ['fofo_test_prompts_cn', 'overall'],
        ],
    },
    {
        'name': 'Followbench',
        'subsets': [
            ['followbench_llmeval_en', 'HSR_AVG'],
            ['followbench_llmeval_en', 'SSR_AVG'],
        ],
    },
]
# Summarizer settings. `dataset_abbrs` lists the [dataset_or_group, metric]
# rows to report: headline scores first, then an empty-string entry, then
# the per-category breakdowns.
summarizer = dict(
    dataset_abbrs=[
        # Headline scores.
        ['alignment_bench_v1_1', '总分'],
        ['alpaca_eval', 'total'],
        ['arenahard', 'score'],
        ['Followbench', 'naive_average'],
        ['CompassArena', 'naive_average'],
        ['FoFo', 'naive_average'],
        ['mtbench101', 'avg'],
        ['wildbench', 'average'],
        ['simpleqa', 'accuracy_given_attempted'],
        ['chinese_simpleqa', 'given_attempted_accuracy'],
        '',
        # AlignBench categories.
        ['alignment_bench_v1_1', '专业能力'],
        ['alignment_bench_v1_1', '数学计算'],
        ['alignment_bench_v1_1', '基本任务'],
        ['alignment_bench_v1_1', '逻辑推理'],
        ['alignment_bench_v1_1', '中文理解'],
        ['alignment_bench_v1_1', '文本写作'],
        ['alignment_bench_v1_1', '角色扮演'],
        ['alignment_bench_v1_1', '综合问答'],
        # AlpacaEval subsets.
        ['alpaca_eval', 'helpful_base'],
        ['alpaca_eval', 'koala'],
        ['alpaca_eval', 'oasst'],
        ['alpaca_eval', 'selfinstruct'],
        ['alpaca_eval', 'vicuna'],
        # CompassArena groups.
        ['compassarena_language', 'naive_average'],
        ['compassarena_knowledge', 'naive_average'],
        ['compassarena_reason_v2', 'naive_average'],
        ['compassarena_math_v2', 'naive_average'],
        ['compassarena_creationv2_zh', 'naive_average'],
        # FoFo.
        ['fofo_test_prompts', 'overall'],
        ['fofo_test_prompts_cn', 'overall'],
        # FollowBench HSR/SSR averages and per-level scores.
        ['followbench_llmeval_en', 'HSR_AVG'],
        ['followbench_llmeval_en', 'SSR_AVG'],
        ['followbench_llmeval_en', 'HSR_L1'],
        ['followbench_llmeval_en', 'HSR_L2'],
        ['followbench_llmeval_en', 'HSR_L3'],
        ['followbench_llmeval_en', 'HSR_L4'],
        ['followbench_llmeval_en', 'HSR_L5'],
        ['followbench_llmeval_en', 'SSR_L1'],
        ['followbench_llmeval_en', 'SSR_L2'],
        ['followbench_llmeval_en', 'SSR_L3'],
        ['followbench_llmeval_en', 'SSR_L4'],
        ['followbench_llmeval_en', 'SSR_L5'],
        # SimpleQA.
        ['simpleqa', 'f1'],
    ],
    type=DefaultSubjectiveSummarizer,
    summary_groups=summary_groups,
)