# OpenCompass/configs/summarizers/chat_OC15.py
from mmengine.config import read_base
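
# read_base() makes the per-benchmark summary-group definitions (mmlu, cmmlu,
# ceval, bbh, GaokaoBench, lcbench) available in locals() for aggregation below.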
with read_base():
    from .groups.mmlu import mmlu_summary_groups
    from .groups.cmmlu import cmmlu_summary_groups
    from .groups.ceval import ceval_summary_groups
    from .groups.bbh import bbh_summary_groups
    from .groups.GaokaoBench import GaokaoBench_summary_groups
    from .groups.lcbench import lcbench_summary_groups
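
# The 'average' group rolls the dataset/metric pairs below into a single headline score.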
other_summary_groups = [
    {
        'name': 'average',
        'subsets': [
            ['mmlu', 'naive_average'],
            ['cmmlu', 'naive_average'],
            ['ceval', 'naive_average'],
            ['GaokaoBench', 'weighted_average'],
            ['triviaqa_wiki_1shot', 'score'],
            ['nq_open_1shot', 'score'],
            ['race-high', 'accuracy'],
            ['winogrande', 'accuracy'],
            ['hellaswag', 'accuracy'],
            ['bbh', 'naive_average'],
            ['gsm8k', 'accuracy'],
            ['math', 'accuracy'],
            ['TheoremQA', 'score'],
            ['openai_humaneval', 'humaneval_pass@1'],
            ['sanitized_mbpp', 'score'],
            ['GPQA_diamond', 'accuracy'],
            ['IFEval', 'Prompt-level-strict-accuracy'],
        ],
    },
]
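
# dataset_abbrs fixes the row order of the summary table ('' acts as a visual
# separator before the per-category sub-scores); summary_groups collects every
# *_summary_groups list currently in scope, including other_summary_groups above.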
summarizer = dict(
    dataset_abbrs=[
        ['average', 'naive_average'],
        ['mmlu', 'naive_average'],
        ['cmmlu', 'naive_average'],
        ['ceval', 'naive_average'],
        ['GaokaoBench', 'weighted_average'],
        ['triviaqa_wiki_1shot', 'score'],
        ['nq_open_1shot', 'score'],
        ['race-high', 'accuracy'],
        ['winogrande', 'accuracy'],
        ['hellaswag', 'accuracy'],
        ['bbh', 'naive_average'],
        ['gsm8k', 'accuracy'],
        ['math', 'accuracy'],
        ['TheoremQA', 'score'],
        ['openai_humaneval', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
        ['GPQA_diamond', 'accuracy'],
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'mmlu',
        'mmlu-stem',
        'mmlu-social-science',
        'mmlu-humanities',
        'mmlu-other',
        'cmmlu',
        'cmmlu-stem',
        'cmmlu-social-science',
        'cmmlu-humanities',
        'cmmlu-other',
        'cmmlu-china-specific',
        'ceval',
        'ceval-stem',
        'ceval-social-science',
        'ceval-humanities',
        'ceval-other',
        'ceval-hard',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
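
# A minimal sketch of how this summarizer is typically referenced from a run
# config; the surrounding config name and dataset/model definitions are
# illustrative, not part of this file:
#
#   from mmengine.config import read_base
#   with read_base():
#       from .summarizers.chat_OC15 import summarizer
#   # ... define `datasets` and `models`, then launch with:
#   #   python run.py <your_config>.py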