diff --git a/configs/eval_mmlu_cf.py b/configs/eval_mmlu_cf.py
new file mode 100644
index 00000000..adb445ae
--- /dev/null
+++ b/configs/eval_mmlu_cf.py
@@ -0,0 +1,38 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import mmlu_cf_datasets
+
+    from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import models as hf_qwen2_5_7b_instruct_model
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
+
+    from opencompass.configs.summarizers.mmlu_cf import summarizer
+
+
+datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], [])
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+
+from opencompass.runners import LocalRunner
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=8,
+        task=dict(type=OpenICLInferTask)
+    ),
+)
+
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=256,
+        task=dict(type=OpenICLEvalTask)
+    ),
+)
+
+work_dir = 'outputs/debug/mmlu_cf'
diff --git a/configs/summarizers/groups/mmlu_cf.py b/configs/summarizers/groups/mmlu_cf.py
new file mode 100644
index 00000000..3e0b8b25
--- /dev/null
+++ b/configs/summarizers/groups/mmlu_cf.py
@@ -0,0 +1,5 @@
+categories = ['Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other', 'Economics', 'Health', 'Psychology', 'Business', 'Biology', 'Philosophy', 'Computer_Science', 'History']
+
+mmlu_cf_summary_groups = [
+    {'name': 'mmlu_cf', 'subsets': ['mmlu_cf_' + c.replace(' ', '_') for c in categories]},
+]
diff --git a/configs/summarizers/mmlu_cf.py b/configs/summarizers/mmlu_cf.py
new file mode 100644
index 00000000..f5d3e7a9
--- /dev/null
+++ b/configs/summarizers/mmlu_cf.py
@@ -0,0 +1,25 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .groups.mmlu_cf import mmlu_cf_summary_groups
+
+summarizer = dict(
+    dataset_abbrs=[
+        'mmlu_cf_Biology',
+        'mmlu_cf_Business',
+        'mmlu_cf_Chemistry',
+        'mmlu_cf_Computer_Science',
+        'mmlu_cf_Economics',
+        'mmlu_cf_Engineering',
+        'mmlu_cf_Health',
+        'mmlu_cf_History',
+        'mmlu_cf_Law',
+        'mmlu_cf_Math',
+        'mmlu_cf_Philosophy',
+        'mmlu_cf_Physics',
+        'mmlu_cf_Psychology',
+        'mmlu_cf_Other',
+        'mmlu_cf',
+    ],
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+)
diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py
new file mode 100644
index 00000000..ab8b198f
--- /dev/null
+++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py
@@ -0,0 +1,16 @@
+categories = [
+    'Math',
+    'Physics',
+    'Chemistry',
+    'Law',
+    'Engineering',
+    'Other',
+    'Economics',
+    'Health',
+    'Psychology',
+    'Business',
+    'Biology',
+    'Philosophy',
+    'Computer_Science',
+    'History',
+]
diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py
new file mode 100644
index 00000000..6500cee7
--- /dev/null
+++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py
@@ -0,0 +1,64 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import MMLUCFDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+with read_base():
+    from .mmlu_cf_categories import categories
+
+mmlu_cf_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+mmlu_cf_datasets = []
+for _name in categories:
+    _hint = 'There is a single choice question (with answers). Answer the question by replying A, B, C or D.'
+    mmlu_cf_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{target}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',  # must contain ice_token so the examples have an insertion point
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',  # token replaced by the retrieved in-context examples
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    mmlu_cf_eval_cfg = dict(
+        evaluator=dict(type=AccwithDetailsEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+    mmlu_cf_datasets.append(
+        dict(
+            abbr=f'mmlu_cf_{_name}',
+            type=MMLUCFDataset,
+            path='microsoft/MMLU-CF',
+            name=_name,
+            reader_cfg=mmlu_cf_reader_cfg,
+            infer_cfg=mmlu_cf_infer_cfg,
+            eval_cfg=mmlu_cf_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
new file mode 100644
index 00000000..5fbee8d9
--- /dev/null
+++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .mmlu_cf_gen_040615 import mmlu_cf_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py
new file mode 100644
index 00000000..851fec91
--- /dev/null
+++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py
@@ -0,0 +1,64 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import MMLUCFDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+with read_base():
+    from .mmlu_cf_categories import categories
+
+mmlu_cf_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+mmlu_cf_datasets = []
+for _name in categories:
+    _hint = 'There is a single choice question. Answer the question by replying A, B, C or D.'
+    mmlu_cf_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{target}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',  # must contain ice_token so the examples have an insertion point
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',  # token replaced by the retrieved in-context examples
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    mmlu_cf_eval_cfg = dict(
+        evaluator=dict(type=AccwithDetailsEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+    mmlu_cf_datasets.append(
+        dict(
+            abbr=f'mmlu_cf_{_name}',
+            type=MMLUCFDataset,
+            path='microsoft/MMLU-CF',
+            name=_name,
+            reader_cfg=mmlu_cf_reader_cfg,
+            infer_cfg=mmlu_cf_infer_cfg,
+            eval_cfg=mmlu_cf_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py
new file mode 100644
index 00000000..d084f4f0
--- /dev/null
+++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py
@@ -0,0 +1,64 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import MMLUCFDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+with read_base():
+    from .mmlu_cf_categories import categories
+
+mmlu_cf_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+mmlu_cf_datasets = []
+for _name in categories:
+    _hint = 'There is a single choice question (with answers). Answer the question by replying A, B, C or D.'
+    mmlu_cf_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{target}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',  # must contain ice_token so the examples have an insertion point
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',  # token replaced by the retrieved in-context examples
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    mmlu_cf_eval_cfg = dict(
+        evaluator=dict(type=AccwithDetailsEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+    mmlu_cf_datasets.append(
+        dict(
+            abbr=f'mmlu_cf_{_name}',
+            type=MMLUCFDataset,
+            path='microsoft/MMLU-CF',
+            name=_name,
+            reader_cfg=mmlu_cf_reader_cfg,
+            infer_cfg=mmlu_cf_infer_cfg,
+            eval_cfg=mmlu_cf_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/summarizers/groups/mmlu_cf.py b/opencompass/configs/summarizers/groups/mmlu_cf.py
new file mode 100644
index 00000000..3e0b8b25
--- /dev/null
+++ b/opencompass/configs/summarizers/groups/mmlu_cf.py
@@ -0,0 +1,5 @@
+categories = ['Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other', 'Economics', 'Health', 'Psychology', 'Business', 'Biology', 'Philosophy', 'Computer_Science', 'History']
+
+mmlu_cf_summary_groups = [
+    {'name': 'mmlu_cf', 'subsets': ['mmlu_cf_' + c.replace(' ', '_') for c in categories]},
+]
diff --git a/opencompass/configs/summarizers/mmlu_cf.py b/opencompass/configs/summarizers/mmlu_cf.py
new file mode 100644
index 00000000..f5d3e7a9
--- /dev/null
+++ b/opencompass/configs/summarizers/mmlu_cf.py
@@ -0,0 +1,25 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .groups.mmlu_cf import mmlu_cf_summary_groups
+
+summarizer = dict(
+    dataset_abbrs=[
+        'mmlu_cf_Biology',
+        'mmlu_cf_Business',
+        'mmlu_cf_Chemistry',
+        'mmlu_cf_Computer_Science',
+        'mmlu_cf_Economics',
+        'mmlu_cf_Engineering',
+        'mmlu_cf_Health',
+        'mmlu_cf_History',
+        'mmlu_cf_Law',
+        'mmlu_cf_Math',
+        'mmlu_cf_Philosophy',
+        'mmlu_cf_Physics',
+        'mmlu_cf_Psychology',
+        'mmlu_cf_Other',
+        'mmlu_cf',
+    ],
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+)
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 58861a8c..e061286f 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -92,6 +92,7 @@ from .mbpp import *  # noqa: F401, F403
 from .medbench import *  # noqa: F401, F403
 from .mgsm import *  # noqa: F401, F403
 from .mmlu import *  # noqa: F401, F403
+from .mmlu_cf import *  # noqa: F401, F403
 from .mmlu_pro import *  # noqa: F401, F403
 from .MMLUArabic import *  # noqa: F401, F403
 from .mmmlu import *  # noqa: F401, F403
diff --git a/opencompass/datasets/mmlu_cf.py b/opencompass/datasets/mmlu_cf.py
new file mode 100644
index 00000000..f3ee8685
--- /dev/null
+++ b/opencompass/datasets/mmlu_cf.py
@@ -0,0 +1,57 @@
+from datasets import DatasetDict, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class MMLUCFDataset(BaseDataset):
+    """Loader for one subject of the MMLU-CF multiple-choice benchmark."""
+
+    @staticmethod
+    def load(path: str, name: str):
+        """Load the ``dev`` and ``val`` splits of a single subject.
+
+        Args:
+            path (str): Dataset id or path passed to ``load_dataset``.
+            name (str): Subject name; the splits ``{name}_dev`` and
+                ``{name}_val`` are read.
+
+        Returns:
+            DatasetDict: ``dev`` and ``test`` (taken from ``val``), each
+            with exactly the columns ``input``, ``A``-``D`` and ``target``.
+
+        Raises:
+            KeyError: If a required subject split is missing.
+        """
+        hf_dataset = load_dataset(path)
+
+        def _to_reader_fields(example):
+            # Hub column names -> field names used by the reader config.
+            return {
+                'input': example['Question'],
+                'A': example['A'],
+                'B': example['B'],
+                'C': example['C'],
+                'D': example['D'],
+                'target': example['Answer'],
+            }
+
+        def _load_split(split: str):
+            sub_set = f'{name}_{split}'
+            if sub_set not in hf_dataset:
+                raise KeyError(
+                    f'Split {sub_set!r} not found in {path!r}; available: '
+                    f'{sorted(hf_dataset.keys())}')
+            raw = hf_dataset[sub_set]
+            # ``map`` alone only adds/overwrites columns, so the source
+            # columns must be dropped explicitly. Only the two splits that
+            # are returned get processed, not every split of the dataset.
+            return raw.map(_to_reader_fields,
+                           remove_columns=raw.column_names)
+
+        return DatasetDict({
+            'dev': _load_split('dev'),
+            'test': _load_split('val'),  # 'val' serves as the test set
+        })