[Feature] Support MMLU-CF Benchmark (#1775)

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* Update mmlu-cf

* Update mmlu-cf

* Update mmlu-cf

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* [Feature] Support MMLU-CF Benchmark

* Remove outside configs

---------

Co-authored-by: liushz <qq1791167085@163.com>
This commit is contained in:
Zhao Qihao 2025-01-09 14:11:20 +08:00 committed by GitHub
parent f1e50d4bf0
commit e039f3efa0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 352 additions and 0 deletions

38
configs/eval_mmlu_cf.py Normal file
View File

@ -0,0 +1,38 @@
# Evaluation config for MMLU-CF: collects dataset / model / summarizer
# configs and declares the `infer` and `eval` stages plus the work directory.
from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

with read_base():
    from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import mmlu_cf_datasets
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import models as hf_qwen2_5_7b_instruct_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.summarizers.mmlu_cf import summarizer

# Concatenate every list in scope whose name ends with `_datasets` (or is
# literally `datasets`) into one flat list; same idea for `*_model` below.
datasets = sum(
    [cfgs for key, cfgs in locals().items()
     if key.endswith('_datasets') or key == 'datasets'],
    [],
)
models = sum(
    [cfgs for key, cfgs in locals().items() if key.endswith('_model')],
    [],
)

# Inference stage: NumWorkerPartitioner with 8 workers, run by a LocalRunner
# capped at 8 workers.
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,
        task=dict(type=OpenICLInferTask),
    ),
)

# Evaluation stage: NaivePartitioner with n=10, run by a LocalRunner capped
# at 256 workers.
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask),
    ),
)

work_dir = 'outputs/debug/mmlu_cf'

View File

@ -0,0 +1,5 @@
# Category names used to build the per-category dataset abbreviations.
categories = [
    'Math',
    'Physics',
    'Chemistry',
    'Law',
    'Engineering',
    'Other',
    'Economics',
    'Health',
    'Psychology',
    'Business',
    'Biology',
    'Philosophy',
    'Computer_Science',
    'History',
]

# A single summary group named 'mmlu_cf' whose subsets are
# 'mmlu_cf_<category>' (any space in a category becomes an underscore).
mmlu_cf_summary_groups = [
    dict(
        name='mmlu_cf',
        subsets=[
            f"mmlu_cf_{subject.replace(' ', '_')}" for subject in categories
        ],
    ),
]

View File

@ -0,0 +1,25 @@
from mmengine.config import read_base
with read_base():
    # Imported so the `locals()` scan below can pick it up by name suffix.
    from .groups.mmlu_cf import mmlu_cf_summary_groups

summarizer = dict(
    # Per-category abbreviations followed by the aggregate 'mmlu_cf' entry.
    dataset_abbrs=[
        'mmlu_cf_Biology', 'mmlu_cf_Business', 'mmlu_cf_Chemistry',
        'mmlu_cf_Computer_Science', 'mmlu_cf_Economics', 'mmlu_cf_Engineering',
        'mmlu_cf_Health', 'mmlu_cf_History', 'mmlu_cf_Law',
        'mmlu_cf_Math', 'mmlu_cf_Philosophy', 'mmlu_cf_Physics',
        'mmlu_cf_Psychology', 'mmlu_cf_Other',
        'mmlu_cf',
    ],
    # Flatten every `*_summary_groups` list currently in scope.
    summary_groups=sum(
        [groups for key, groups in locals().items()
         if key.endswith('_summary_groups')],
        [],
    ),
)

View File

@ -0,0 +1,16 @@
# Category names; the dataset configs that import this module iterate over
# the list in this order.
categories = [
    'Math', 'Physics', 'Chemistry', 'Law',
    'Engineering', 'Other', 'Economics', 'Health',
    'Psychology', 'Business', 'Biology', 'Philosophy',
    'Computer_Science', 'History',
]

View File

@ -0,0 +1,64 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUCFDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
    from .mmlu_cf_categories import categories

# Reader settings shared by every category entry.
mmlu_cf_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev',
)

mmlu_cf_datasets = []
for _name in categories:
    _hint = 'There is a single choice question (with answers). Answer the question by replying A, B, C or D.'
    # The same question layout is used by both the in-context example
    # template and the final prompt template.
    _question = _hint + '\nQuestion: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '

    mmlu_cf_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=_question),
                dict(role='BOT', prompt='{target}\n'),
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt=_question),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_cf_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
    )

    # One dataset entry per category, abbreviated 'mmlu_cf_<category>'.
    mmlu_cf_datasets.append(
        dict(
            abbr=f'mmlu_cf_{_name}',
            type=MMLUCFDataset,
            path='microsoft/MMLU-CF',
            name=_name,
            reader_cfg=mmlu_cf_reader_cfg,
            infer_cfg=mmlu_cf_infer_cfg,
            eval_cfg=mmlu_cf_eval_cfg,
        ))

# Drop the loop temporaries so they do not remain as module-level names.
del _name, _hint, _question

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
# Re-export `mmlu_cf_datasets` from the `mmlu_cf_gen_040615` config so this
# module can be imported under the shorter name.
with read_base():
    from .mmlu_cf_gen_040615 import mmlu_cf_datasets  # noqa: F401, F403

View File

@ -0,0 +1,64 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUCFDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
    from .mmlu_cf_categories import categories

# Reader settings shared by every category entry.
mmlu_cf_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev',
)

mmlu_cf_datasets = []
for _name in categories:
    _hint = 'There is a single choice question. Answer the question by replying A, B, C or D.'
    # The same question layout is used by both the in-context example
    # template and the final prompt template.
    _question = _hint + '\nQuestion: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '

    mmlu_cf_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=_question),
                dict(role='BOT', prompt='{target}\n'),
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt=_question),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_cf_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
    )

    # One dataset entry per category, abbreviated 'mmlu_cf_<category>'.
    mmlu_cf_datasets.append(
        dict(
            abbr=f'mmlu_cf_{_name}',
            type=MMLUCFDataset,
            path='microsoft/MMLU-CF',
            name=_name,
            reader_cfg=mmlu_cf_reader_cfg,
            infer_cfg=mmlu_cf_infer_cfg,
            eval_cfg=mmlu_cf_eval_cfg,
        ))

# Drop the loop temporaries so they do not remain as module-level names.
del _name, _hint, _question

View File

@ -0,0 +1,64 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUCFDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
    from .mmlu_cf_categories import categories

# Reader settings shared by every category entry.
mmlu_cf_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev',
)

mmlu_cf_datasets = []
for _name in categories:
    _hint = 'There is a single choice question (with answers). Answer the question by replying A, B, C or D.'
    # The same question layout is used by both the in-context example
    # template and the final prompt template.
    _question = _hint + '\nQuestion: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '

    mmlu_cf_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=_question),
                dict(role='BOT', prompt='{target}\n'),
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt=_question),
                ],
            ),
            ice_token='</E>',
        ),
        # This variant differs from the few-shot config only in the retriever.
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_cf_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
    )

    # One dataset entry per category, abbreviated 'mmlu_cf_<category>'.
    mmlu_cf_datasets.append(
        dict(
            abbr=f'mmlu_cf_{_name}',
            type=MMLUCFDataset,
            path='microsoft/MMLU-CF',
            name=_name,
            reader_cfg=mmlu_cf_reader_cfg,
            infer_cfg=mmlu_cf_infer_cfg,
            eval_cfg=mmlu_cf_eval_cfg,
        ))

# Drop the loop temporaries so they do not remain as module-level names.
del _name, _hint, _question

View File

@ -0,0 +1,5 @@
# Category names used to build the per-category dataset abbreviations.
categories = [
    'Math',
    'Physics',
    'Chemistry',
    'Law',
    'Engineering',
    'Other',
    'Economics',
    'Health',
    'Psychology',
    'Business',
    'Biology',
    'Philosophy',
    'Computer_Science',
    'History',
]

# A single summary group named 'mmlu_cf' whose subsets are
# 'mmlu_cf_<category>' (any space in a category becomes an underscore).
mmlu_cf_summary_groups = [
    dict(
        name='mmlu_cf',
        subsets=[
            f"mmlu_cf_{subject.replace(' ', '_')}" for subject in categories
        ],
    ),
]

View File

@ -0,0 +1,25 @@
from mmengine.config import read_base
with read_base():
    # Imported so the `locals()` scan below can pick it up by name suffix.
    from .groups.mmlu_cf import mmlu_cf_summary_groups

summarizer = dict(
    # Per-category abbreviations followed by the aggregate 'mmlu_cf' entry.
    dataset_abbrs=[
        'mmlu_cf_Biology', 'mmlu_cf_Business', 'mmlu_cf_Chemistry',
        'mmlu_cf_Computer_Science', 'mmlu_cf_Economics', 'mmlu_cf_Engineering',
        'mmlu_cf_Health', 'mmlu_cf_History', 'mmlu_cf_Law',
        'mmlu_cf_Math', 'mmlu_cf_Philosophy', 'mmlu_cf_Physics',
        'mmlu_cf_Psychology', 'mmlu_cf_Other',
        'mmlu_cf',
    ],
    # Flatten every `*_summary_groups` list currently in scope.
    summary_groups=sum(
        [groups for key, groups in locals().items()
         if key.endswith('_summary_groups')],
        [],
    ),
)

View File

@ -92,6 +92,7 @@ from .mbpp import * # noqa: F401, F403
from .medbench import * # noqa: F401, F403
from .mgsm import * # noqa: F401, F403
from .mmlu import * # noqa: F401, F403
from .mmlu_cf import * # noqa: F401, F403
from .mmlu_pro import * # noqa: F401, F403
from .MMLUArabic import * # noqa: F401, F403
from .mmmlu import * # noqa: F401, F403

View File

@ -0,0 +1,41 @@
from datasets import DatasetDict, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MMLUCFDataset(BaseDataset):
    """Dataset loader that exposes one MMLU-CF category as dev/test splits."""

    @staticmethod
    def load(path: str, name: str):
        """Load one category of the dataset found at ``path``.

        Args:
            path: Dataset identifier or path passed to ``load_dataset``.
            name: Category name. The source splits ``{name}_dev`` and
                ``{name}_val`` are read.

        Returns:
            DatasetDict: ``'dev'`` is built from ``{name}_dev`` and
            ``'test'`` from ``{name}_val``. Every example keeps its original
            columns and additionally carries ``input`` (copy of
            ``Question``) and ``target`` (copy of ``Answer``).

        Raises:
            KeyError: If one of the two source splits is not present in the
                loaded dataset.
        """
        hf_dataset = load_dataset(path)

        def _add_prompt_fields(example):
            # `map` merges the returned dict into the example, so existing
            # columns (Question, A-D, Answer) are kept without re-emitting
            # them; only the two new columns have to be produced.
            return {
                'input': example['Question'],
                'target': example['Answer'],
            }

        # Only the two splits of the requested category are mapped. The
        # previous implementation first ran a "keep these columns" map over
        # every split of every category, which removed nothing (map never
        # drops columns by itself) and only cost time.
        return DatasetDict({
            'dev': hf_dataset[f'{name}_dev'].map(_add_prompt_fields),
            # The source 'val' split is served as the 'test' split.
            'test': hf_dataset[f'{name}_val'].map(_add_prompt_fields),
        })