# OpenCompass/opencompass/configs/summarizers/compassbench_v1_3_objective.py

from mmengine.config import read_base
# Pull in the summary-group lists defined by the base configs.  Both imported
# names end in ``_summary_groups``, so the ``summarizer`` definition in this
# file collects them through its scan of ``locals()``.
# NOTE(review): the imports must stay inside ``read_base()`` -- presumably
# mmengine only resolves relative config imports within that context; confirm
# against the mmengine Config documentation.
with read_base():
    from .groups.legacy.cibench import cibench_summary_groups
    from .groups.plugineval import plugineval_summary_groups
# Summary groups for the objective part of the report.  Every entry has a
# ``name`` and a list of ``subsets``; each subset is a
# ``[dataset_or_group_abbr, metric]`` pair.
obj_summary_groups = [
    # ------------------------------ knowledge ------------------------------
    dict(
        name='knowledge_en',
        subsets=[
            ['compassbench-wiki_en_sub_500_人文科学_circular', 'perf_4'],
            ['compassbench-wiki_en_sub_500_社会科学_circular', 'perf_4'],
            ['compassbench-wiki_en_sub_500_自然科学-工科_circular', 'perf_4'],
            ['compassbench-wiki_en_sub_500_自然科学-理科_circular', 'perf_4'],
        ],
    ),
    dict(
        name='knowledge_cn',
        subsets=[
            ['compassbench-wiki_zh_sub_500_人文科学_circular', 'perf_4'],
            ['compassbench-wiki_zh_sub_500_社会科学_circular', 'perf_4'],
            ['compassbench-wiki_zh_sub_500_自然科学-工科_circular', 'perf_4'],
            ['compassbench-wiki_zh_sub_500_自然科学-理科_circular', 'perf_4'],
        ],
    ),
    # Overall knowledge group: the four English subsets followed by the four
    # Chinese ones, listed explicitly rather than via the two groups above.
    dict(
        name='knowledge',
        subsets=[
            ['compassbench-wiki_en_sub_500_人文科学_circular', 'perf_4'],
            ['compassbench-wiki_en_sub_500_社会科学_circular', 'perf_4'],
            ['compassbench-wiki_en_sub_500_自然科学-工科_circular', 'perf_4'],
            ['compassbench-wiki_en_sub_500_自然科学-理科_circular', 'perf_4'],
            ['compassbench-wiki_zh_sub_500_人文科学_circular', 'perf_4'],
            ['compassbench-wiki_zh_sub_500_社会科学_circular', 'perf_4'],
            ['compassbench-wiki_zh_sub_500_自然科学-工科_circular', 'perf_4'],
            ['compassbench-wiki_zh_sub_500_自然科学-理科_circular', 'perf_4'],
        ],
    ),
    # -------------------------------- math ---------------------------------
    dict(
        name='math_en',
        subsets=[
            ['compassbench-college_single_choice_en_circular', 'perf_4'],
            ['compassbench-arithmetic_cloze_en', 'accuracy'],
        ],
    ),
    # NOTE(review): math_cn reuses the English arithmetic cloze subset; no
    # Chinese cloze subset appears anywhere in this file -- confirm intended.
    dict(
        name='math_cn',
        subsets=[
            ['compassbench-college_single_choice_cn_circular', 'perf_4'],
            ['compassbench-arithmetic_cloze_en', 'accuracy'],
        ],
    ),
    dict(
        name='math',
        subsets=[
            ['compassbench-college_single_choice_cn_circular', 'perf_4'],
            ['compassbench-college_single_choice_en_circular', 'perf_4'],
            ['compassbench-arithmetic_cloze_en', 'accuracy'],
        ],
    ),
    # -------------------------------- code ---------------------------------
    # NOTE(review): the 'cdoe' spelling is kept byte-for-byte; it presumably
    # has to match the dataset abbreviation defined elsewhere -- verify before
    # "fixing" it.  The en/cn completion groups also use different metrics.
    dict(
        name='code-completion_en',
        subsets=[
            ['compass_bench_cdoe_completion_en', 'humaneval_plus_pass@1'],
        ],
    ),
    dict(
        name='code-completion_cn',
        subsets=[
            ['compass_bench_cdoe_completion_zh', 'humaneval_pass@1'],
        ],
    ),
    dict(
        name='code-interview_en',
        subsets=[
            ['compass_bench_code_interview_en-EASY', 'pass@1'],
            ['compass_bench_code_interview_en-MEDIUM', 'pass@1'],
            ['compass_bench_code_interview_en-HARD', 'pass@1'],
        ],
    ),
    dict(
        name='code-interview_cn',
        subsets=[
            ['compass_bench_code_interview_zh-EASY', 'pass@1'],
            ['compass_bench_code_interview_zh-MEDIUM', 'pass@1'],
            ['compass_bench_code_interview_zh-HARD', 'pass@1'],
        ],
    ),
    dict(
        name='code-competition',
        subsets=[
            ['TACO-EASY', 'pass@1'],
            ['TACO-MEDIUM', 'pass@1'],
            ['TACO-MEDIUM_HARD', 'pass@1'],
            ['TACO-HARD', 'pass@1'],
            ['TACO-VERY_HARD', 'pass@1'],
        ],
    ),
    # The remaining code groups are built from the code groups defined above,
    # each referenced through its 'naive_average' metric.
    dict(
        name='code_cn',
        subsets=[
            ['code-completion_cn', 'naive_average'],
            ['code-interview_cn', 'naive_average'],
        ],
    ),
    dict(
        name='code_en',
        subsets=[
            ['code-completion_en', 'naive_average'],
            ['code-interview_en', 'naive_average'],
            ['code-competition', 'naive_average'],
        ],
    ),
    dict(
        name='code',
        subsets=[
            ['code-completion_cn', 'naive_average'],
            ['code-interview_cn', 'naive_average'],
            ['code-completion_en', 'naive_average'],
            ['code-interview_en', 'naive_average'],
            ['code-competition', 'naive_average'],
        ],
    ),
]
# Summary groups for the agent part of the report.  Here ``subsets`` holds
# plain strings rather than ``[abbr, metric]`` pairs.
agent_summary_groups = [
    {
        'name': 'cibench_template',
        'subsets': [
            'cibench_template_wo_nltk:executable',
            'cibench_template_wo_nltk:numeric_correct',
            'cibench_template_wo_nltk:vis_sim',
        ],
    },
    {
        'name': 'cibench_template_cn',
        'subsets': [
            'cibench_template_cn_wo_nltk:executable',
            'cibench_template_cn_wo_nltk:numeric_correct',
            'cibench_template_cn_wo_nltk:vis_sim',
        ],
    },
    # The two cibench_template groups above are currently not part of the
    # agent aggregates.  The disabled variants that included them were:
    #   agent_cn: ['cibench_template_cn', 'plugin_eval-mus-p10_one_review_zh']
    #   agent_en: ['cibench_template', 'plugin_eval-mus-p10_one_review']
    {
        'name': 'agent_cn',
        'subsets': ['plugin_eval-mus-p10_one_review_zh'],
    },
    {
        'name': 'agent_en',
        'subsets': ['plugin_eval-mus-p10_one_review'],
    },
    # Overall agent group, built from the two language-specific groups.
    {
        'name': 'agent',
        'subsets': ['agent_cn', 'agent_en'],
    },
]
# Summarizer definition: ``dataset_abbrs`` lists the report rows in display
# order and ``summary_groups`` gathers every group list visible in this module.
summarizer = {
    'dataset_abbrs': [
        # An overall ["average", "naive_average"] row followed by a blank
        # separator used to head this list; both are currently disabled.
        # Knowledge rows.
        ['knowledge', 'naive_average'],
        ['knowledge_en', 'naive_average'],
        ['knowledge_cn', 'naive_average'],
        ['compassbench-wiki_en_sub_500_人文科学_circular', 'perf_4'],
        ['compassbench-wiki_en_sub_500_社会科学_circular', 'perf_4'],
        ['compassbench-wiki_en_sub_500_自然科学-工科_circular', 'perf_4'],
        ['compassbench-wiki_en_sub_500_自然科学-理科_circular', 'perf_4'],
        ['compassbench-wiki_zh_sub_500_人文科学_circular', 'perf_4'],
        ['compassbench-wiki_zh_sub_500_社会科学_circular', 'perf_4'],
        ['compassbench-wiki_zh_sub_500_自然科学-工科_circular', 'perf_4'],
        ['compassbench-wiki_zh_sub_500_自然科学-理科_circular', 'perf_4'],
        '',  # empty-string entry between sections
        # Math rows.
        ['math', 'naive_average'],
        ['math_en', 'naive_average'],
        ['math_cn', 'naive_average'],
        ['compassbench-college_single_choice_cn_circular', 'perf_4'],
        ['compassbench-college_single_choice_en_circular', 'perf_4'],
        ['compassbench-arithmetic_cloze_en', 'accuracy'],
        '',
        # Code rows.
        ['code', 'naive_average'],
        ['code_cn', 'naive_average'],
        ['code_en', 'naive_average'],
        ['code-completion_cn', 'naive_average'],
        ['code-completion_en', 'naive_average'],
        ['code-interview_cn', 'naive_average'],
        ['code-interview_en', 'naive_average'],
        ['code-competition', 'naive_average'],
        '',
        # Agent rows.
        ['agent', 'naive_average'],
        ['agent_en', 'naive_average'],
        ['agent_cn', 'naive_average'],
        ['plugin_eval-mus-p10_one_review_zh', 'naive_average'],
        ['plugin_eval-mus-p10_one_review', 'naive_average'],
    ],
    # Concatenate every list bound to a name ending in '_summary_groups' in
    # the current scope; any such name defined or imported earlier in this
    # module is therefore included automatically.
    'summary_groups': sum(
        [
            group_list
            for var_name, group_list in locals().items()
            if var_name.endswith('_summary_groups')
        ],
        [],
    ),
}