mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Support MMLU-CF Benchmark (#1775)
* [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * Update mmlu-cf * Update mmlu-cf * Update mmlu-cf * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * [Feature] Support MMLU-CF Benchmark * Remove outside configs --------- Co-authored-by: liushz <qq1791167085@163.com>
This commit is contained in:
parent
f1e50d4bf0
commit
e039f3efa0
38
configs/eval_mmlu_cf.py
Normal file
38
configs/eval_mmlu_cf.py
Normal file
@ -0,0 +1,38 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import mmlu_cf_datasets
|
||||
|
||||
from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import models as hf_qwen2_5_7b_instruct_model
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
|
||||
|
||||
from opencompass.configs.summarizers.mmlu_cf import summarizer
|
||||
|
||||
|
||||
# Flatten every list bound to a name ending in ``_datasets`` (or to a plain
# ``datasets``) in this module's namespace into one list.
datasets = sum(
    (_cfgs for _key, _cfgs in locals().items()
     if _key == 'datasets' or _key.endswith('_datasets')),
    [],
)
|
||||
# Flatten every list bound to a name ending in ``_model`` in this module's
# namespace into one list.
models = sum(
    (_cfgs for _key, _cfgs in locals().items() if _key.endswith('_model')),
    [],
)
|
||||
|
||||
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
# Inference stage settings: NumWorkerPartitioner with num_worker=8, executed
# by a LocalRunner (max_num_workers=8) running OpenICLInferTask.
infer = {
    'partitioner': {'type': NumWorkerPartitioner, 'num_worker': 8},
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 8,
        'task': {'type': OpenICLInferTask},
    },
}
|
||||
|
||||
# Evaluation stage settings: NaivePartitioner with n=10, executed by a
# LocalRunner (max_num_workers=256) running OpenICLEvalTask.
# The name ``eval`` is kept as-is even though it shadows the builtin.
eval = {
    'partitioner': {'type': NaivePartitioner, 'n': 10},
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 256,
        'task': {'type': OpenICLEvalTask},
    },
}
|
||||
|
||||
# Relative output directory for this configuration.
# NOTE(review): presumably where the run's predictions and results are
# written — confirm against the runner.
work_dir = 'outputs/debug/mmlu_cf'
|
5
configs/summarizers/groups/mmlu_cf.py
Normal file
5
configs/summarizers/groups/mmlu_cf.py
Normal file
@ -0,0 +1,5 @@
|
||||
# Subject names used to build the 'mmlu_cf' summary group below.
categories = [
    'Math',
    'Physics',
    'Chemistry',
    'Law',
    'Engineering',
    'Other',
    'Economics',
    'Health',
    'Psychology',
    'Business',
    'Biology',
    'Philosophy',
    'Computer_Science',
    'History',
]
|
||||
|
||||
# One group named 'mmlu_cf' whose subsets are 'mmlu_cf_<subject>' for every
# entry of ``categories`` (any space in a subject name becomes '_').
mmlu_cf_summary_groups = [
    dict(
        name='mmlu_cf',
        subsets=[f"mmlu_cf_{_c.replace(' ', '_')}" for _c in categories],
    ),
]
|
25
configs/summarizers/mmlu_cf.py
Normal file
25
configs/summarizers/mmlu_cf.py
Normal file
@ -0,0 +1,25 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .groups.mmlu_cf import mmlu_cf_summary_groups
|
||||
|
||||
# Summarizer settings: the listed per-subject abbreviations followed by
# 'mmlu_cf', plus every ``*_summary_groups`` list found in this module.
summarizer = {
    'dataset_abbrs': [
        'mmlu_cf_Biology',
        'mmlu_cf_Business',
        'mmlu_cf_Chemistry',
        'mmlu_cf_Computer_Science',
        'mmlu_cf_Economics',
        'mmlu_cf_Engineering',
        'mmlu_cf_Health',
        'mmlu_cf_History',
        'mmlu_cf_Law',
        'mmlu_cf_Math',
        'mmlu_cf_Philosophy',
        'mmlu_cf_Physics',
        'mmlu_cf_Psychology',
        'mmlu_cf_Other',
        'mmlu_cf',
    ],
    'summary_groups': sum(
        (_groups for _key, _groups in locals().items()
         if _key.endswith('_summary_groups')),
        [],
    ),
}
|
16
opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py
Normal file
16
opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py
Normal file
@ -0,0 +1,16 @@
|
||||
# Subject names; each one is iterated over by the MMLU-CF dataset configs
# that import this list.
categories = [
    'Math', 'Physics', 'Chemistry', 'Law', 'Engineering',
    'Other', 'Economics', 'Health', 'Psychology', 'Business',
    'Biology', 'Philosophy', 'Computer_Science', 'History',
]
|
64
opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py
Normal file
64
opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py
Normal file
@ -0,0 +1,64 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
|
||||
from opencompass.datasets import MMLUCFDataset
|
||||
from opencompass.utils.text_postprocessors import first_option_postprocess
|
||||
|
||||
with read_base():
|
||||
from .mmlu_cf_categories import categories
|
||||
|
||||
# Reader settings: the question text and four options are inputs, 'target'
# is the output column, and the 'dev' split is used as the train split.
mmlu_cf_reader_cfg = {
    'input_columns': ['input', 'A', 'B', 'C', 'D'],
    'output_column': 'target',
    'train_split': 'dev',
}
|
||||
|
||||
# Build one dataset config per subject in ``categories``. Examples are
# selected by FixKRetriever with fix_id_list [0, 1, 2, 3, 4].
mmlu_cf_datasets = []
for _name in categories:
    _hint = 'There is a single choice question (with answers). Answer the question by replying A, B, C or D.'
    # The same question prompt is used by the example template and by the
    # final prompt template.
    _question = f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '

    mmlu_cf_infer_cfg = {
        'ice_template': {
            'type': PromptTemplate,
            'template': {
                'round': [
                    {'role': 'HUMAN', 'prompt': _question},
                    {'role': 'BOT', 'prompt': '{target}\n'},
                ],
            },
        },
        'prompt_template': {
            'type': PromptTemplate,
            'template': {
                'begin': '</E>',
                'round': [
                    {'role': 'HUMAN', 'prompt': _question},
                ],
            },
            'ice_token': '</E>',
        },
        'retriever': {'type': FixKRetriever, 'fix_id_list': [0, 1, 2, 3, 4]},
        'inferencer': {'type': GenInferencer},
    }

    # Accuracy evaluation on the first option letter (A-D) extracted from
    # the prediction by ``first_option_postprocess``.
    mmlu_cf_eval_cfg = {
        'evaluator': {'type': AccwithDetailsEvaluator},
        'pred_postprocessor': {
            'type': first_option_postprocess,
            'options': 'ABCD',
        },
    }

    mmlu_cf_datasets.append({
        'abbr': f'mmlu_cf_{_name}',
        'type': MMLUCFDataset,
        'path': 'microsoft/MMLU-CF',
        'name': _name,
        'reader_cfg': mmlu_cf_reader_cfg,
        'infer_cfg': mmlu_cf_infer_cfg,
        'eval_cfg': mmlu_cf_eval_cfg,
    })

# Remove the loop temporaries from the module namespace.
del _name, _hint, _question
|
4
opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
Normal file
4
opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
Normal file
@ -0,0 +1,4 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403
|
64
opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py
Normal file
64
opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py
Normal file
@ -0,0 +1,64 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import FixKRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
|
||||
from opencompass.datasets import MMLUCFDataset
|
||||
from opencompass.utils.text_postprocessors import first_option_postprocess
|
||||
|
||||
with read_base():
|
||||
from .mmlu_cf_categories import categories
|
||||
|
||||
# Reader settings: the question text and four options are inputs, 'target'
# is the output column, and the 'dev' split is used as the train split.
mmlu_cf_reader_cfg = {
    'input_columns': ['input', 'A', 'B', 'C', 'D'],
    'output_column': 'target',
    'train_split': 'dev',
}
|
||||
|
||||
# Build one dataset config per subject in ``categories``. Examples are
# selected by FixKRetriever with fix_id_list [0, 1, 2, 3, 4].
mmlu_cf_datasets = []
for _name in categories:
    _hint = 'There is a single choice question. Answer the question by replying A, B, C or D.'
    # The same question prompt is used by the example template and by the
    # final prompt template.
    _question = f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '

    mmlu_cf_infer_cfg = {
        'ice_template': {
            'type': PromptTemplate,
            'template': {
                'round': [
                    {'role': 'HUMAN', 'prompt': _question},
                    {'role': 'BOT', 'prompt': '{target}\n'},
                ],
            },
        },
        'prompt_template': {
            'type': PromptTemplate,
            'template': {
                'begin': '</E>',
                'round': [
                    {'role': 'HUMAN', 'prompt': _question},
                ],
            },
            'ice_token': '</E>',
        },
        'retriever': {'type': FixKRetriever, 'fix_id_list': [0, 1, 2, 3, 4]},
        'inferencer': {'type': GenInferencer},
    }

    # Accuracy evaluation on the first option letter (A-D) extracted from
    # the prediction by ``first_option_postprocess``.
    mmlu_cf_eval_cfg = {
        'evaluator': {'type': AccwithDetailsEvaluator},
        'pred_postprocessor': {
            'type': first_option_postprocess,
            'options': 'ABCD',
        },
    }

    mmlu_cf_datasets.append({
        'abbr': f'mmlu_cf_{_name}',
        'type': MMLUCFDataset,
        'path': 'microsoft/MMLU-CF',
        'name': _name,
        'reader_cfg': mmlu_cf_reader_cfg,
        'infer_cfg': mmlu_cf_infer_cfg,
        'eval_cfg': mmlu_cf_eval_cfg,
    })

# Remove the loop temporaries from the module namespace.
del _name, _hint, _question
|
64
opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py
Normal file
64
opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py
Normal file
@ -0,0 +1,64 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
|
||||
from opencompass.datasets import MMLUCFDataset
|
||||
from opencompass.utils.text_postprocessors import first_option_postprocess
|
||||
|
||||
with read_base():
|
||||
from .mmlu_cf_categories import categories
|
||||
|
||||
# Reader settings: the question text and four options are inputs, 'target'
# is the output column, and the 'dev' split is used as the train split.
mmlu_cf_reader_cfg = {
    'input_columns': ['input', 'A', 'B', 'C', 'D'],
    'output_column': 'target',
    'train_split': 'dev',
}
|
||||
|
||||
# Build one dataset config per subject in ``categories``. This variant uses
# ZeroRetriever as its retriever.
mmlu_cf_datasets = []
for _name in categories:
    # NOTE(review): the hint says "(with answers)" although this variant
    # uses ZeroRetriever — confirm the wording is intended.
    _hint = 'There is a single choice question (with answers). Answer the question by replying A, B, C or D.'
    # The same question prompt is used by the example template and by the
    # final prompt template.
    _question = f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '

    mmlu_cf_infer_cfg = {
        'ice_template': {
            'type': PromptTemplate,
            'template': {
                'round': [
                    {'role': 'HUMAN', 'prompt': _question},
                    {'role': 'BOT', 'prompt': '{target}\n'},
                ],
            },
        },
        'prompt_template': {
            'type': PromptTemplate,
            'template': {
                'begin': '</E>',
                'round': [
                    {'role': 'HUMAN', 'prompt': _question},
                ],
            },
            'ice_token': '</E>',
        },
        'retriever': {'type': ZeroRetriever},
        'inferencer': {'type': GenInferencer},
    }

    # Accuracy evaluation on the first option letter (A-D) extracted from
    # the prediction by ``first_option_postprocess``.
    mmlu_cf_eval_cfg = {
        'evaluator': {'type': AccwithDetailsEvaluator},
        'pred_postprocessor': {
            'type': first_option_postprocess,
            'options': 'ABCD',
        },
    }

    mmlu_cf_datasets.append({
        'abbr': f'mmlu_cf_{_name}',
        'type': MMLUCFDataset,
        'path': 'microsoft/MMLU-CF',
        'name': _name,
        'reader_cfg': mmlu_cf_reader_cfg,
        'infer_cfg': mmlu_cf_infer_cfg,
        'eval_cfg': mmlu_cf_eval_cfg,
    })

# Remove the loop temporaries from the module namespace.
del _name, _hint, _question
|
5
opencompass/configs/summarizers/groups/mmlu_cf.py
Normal file
5
opencompass/configs/summarizers/groups/mmlu_cf.py
Normal file
@ -0,0 +1,5 @@
|
||||
# Subject names used to build the 'mmlu_cf' summary group below.
categories = [
    'Math',
    'Physics',
    'Chemistry',
    'Law',
    'Engineering',
    'Other',
    'Economics',
    'Health',
    'Psychology',
    'Business',
    'Biology',
    'Philosophy',
    'Computer_Science',
    'History',
]
|
||||
|
||||
# One group named 'mmlu_cf' whose subsets are 'mmlu_cf_<subject>' for every
# entry of ``categories`` (any space in a subject name becomes '_').
mmlu_cf_summary_groups = [
    dict(
        name='mmlu_cf',
        subsets=[f"mmlu_cf_{_c.replace(' ', '_')}" for _c in categories],
    ),
]
|
25
opencompass/configs/summarizers/mmlu_cf.py
Normal file
25
opencompass/configs/summarizers/mmlu_cf.py
Normal file
@ -0,0 +1,25 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .groups.mmlu_cf import mmlu_cf_summary_groups
|
||||
|
||||
# Summarizer settings: the listed per-subject abbreviations followed by
# 'mmlu_cf', plus every ``*_summary_groups`` list found in this module.
summarizer = {
    'dataset_abbrs': [
        'mmlu_cf_Biology',
        'mmlu_cf_Business',
        'mmlu_cf_Chemistry',
        'mmlu_cf_Computer_Science',
        'mmlu_cf_Economics',
        'mmlu_cf_Engineering',
        'mmlu_cf_Health',
        'mmlu_cf_History',
        'mmlu_cf_Law',
        'mmlu_cf_Math',
        'mmlu_cf_Philosophy',
        'mmlu_cf_Physics',
        'mmlu_cf_Psychology',
        'mmlu_cf_Other',
        'mmlu_cf',
    ],
    'summary_groups': sum(
        (_groups for _key, _groups in locals().items()
         if _key.endswith('_summary_groups')),
        [],
    ),
}
|
@ -92,6 +92,7 @@ from .mbpp import * # noqa: F401, F403
|
||||
from .medbench import * # noqa: F401, F403
|
||||
from .mgsm import * # noqa: F401, F403
|
||||
from .mmlu import * # noqa: F401, F403
|
||||
from .mmlu_cf import * # noqa: F401, F403
|
||||
from .mmlu_pro import * # noqa: F401, F403
|
||||
from .MMLUArabic import * # noqa: F401, F403
|
||||
from .mmmlu import * # noqa: F401, F403
|
||||
|
41
opencompass/datasets/mmlu_cf.py
Normal file
41
opencompass/datasets/mmlu_cf.py
Normal file
@ -0,0 +1,41 @@
|
||||
from datasets import DatasetDict, load_dataset
|
||||
|
||||
from opencompass.registry import LOAD_DATASET
|
||||
|
||||
from .base import BaseDataset
|
||||
|
||||
|
||||
@LOAD_DATASET.register_module()
class MMLUCFDataset(BaseDataset):
    """Loader that reads one subject's ``dev`` and ``val`` splits."""

    @staticmethod
    def load(path: str, name: str):
        """Load the dev/val splits of one subject and add reader columns.

        Args:
            path (str): Dataset identifier handed to
                ``datasets.load_dataset``.
            name (str): Subject name. The splits ``f'{name}_dev'`` and
                ``f'{name}_val'`` are read from the loaded dataset.

        Returns:
            DatasetDict: ``'dev'`` holds the ``{name}_dev`` split and
            ``'test'`` holds the ``{name}_val`` split. Every row keeps its
            original columns and gains ``'input'`` (copy of ``'Question'``)
            and ``'target'`` (copy of ``'Answer'``).

        Raises:
            KeyError: If either split is missing from the loaded dataset.
        """
        hf_dataset = load_dataset(path)

        dev_split = f'{name}_dev'
        val_split = f'{name}_val'

        # Fail early with a readable message instead of a bare KeyError on
        # the split name.
        missing = [s for s in (dev_split, val_split) if s not in hf_dataset]
        if missing:
            raise KeyError(
                f'Split(s) {missing} not found in dataset {path!r}; '
                f'available splits: {list(hf_dataset.keys())}')

        def _add_reader_columns(example):
            # ``map`` merges the returned dict into each row, so only the
            # two new columns are returned; 'A'-'D' are already present
            # under the expected names and stay untouched.
            return {
                'input': example['Question'],
                'target': example['Answer'],
            }

        # Only the two requested splits are processed. A whole-dataset
        # ``map`` that returns existing columns would not drop any column
        # and would only rewrite every split of every subject.
        return DatasetDict({
            'dev': hf_dataset[dev_split].map(_add_reader_columns),
            # The 'val' split is exposed as 'test'.
            'test': hf_dataset[val_split].map(_add_reader_columns),
        })
|
Loading…
Reference in New Issue
Block a user