From 47a752cd56f5a1d3c1c8d888b20d2ba46e51c4bf Mon Sep 17 00:00:00 2001
From: Xidong Wang <73694546+wangxidong06@users.noreply.github.com>
Date: Tue, 12 Sep 2023 19:16:41 +0800
Subject: [PATCH] [Dataset] Add CMB (#376)

* Add CMB

* modify CMB

---------

Co-authored-by: wangxidong
---
 configs/datasets/cmb/cmb_gen.py        |  4 +++
 configs/datasets/cmb/cmb_gen_72cbb7.py | 43 ++++++++++++++++++++++++++
 opencompass/datasets/__init__.py       |  1 +
 opencompass/datasets/cmb.py            | 42 +++++++++++++++++++++++++
 4 files changed, 90 insertions(+)
 create mode 100644 configs/datasets/cmb/cmb_gen.py
 create mode 100644 configs/datasets/cmb/cmb_gen_72cbb7.py
 create mode 100644 opencompass/datasets/cmb.py

diff --git a/configs/datasets/cmb/cmb_gen.py b/configs/datasets/cmb/cmb_gen.py
new file mode 100644
index 00000000..5d379ad1
--- /dev/null
+++ b/configs/datasets/cmb/cmb_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .cmb_gen_72cbb7 import cmb_datasets  # noqa: F401, F403
diff --git a/configs/datasets/cmb/cmb_gen_72cbb7.py b/configs/datasets/cmb/cmb_gen_72cbb7.py
new file mode 100644
index 00000000..4cb9a325
--- /dev/null
+++ b/configs/datasets/cmb/cmb_gen_72cbb7.py
@@ -0,0 +1,43 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import CMBDataset
+
+
+cmb_datasets = []
+
+cmb_reader_cfg = dict(
+    input_columns=["exam_type", "exam_class", "question_type", "question", "option_str"],
+    output_column=None,
+    train_split="val",
+    test_split="test"
+)
+
+cmb_infer_cfg = dict(
+    ice_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin="",
+            round=[
+                dict(
+                    role="HUMAN",
+                    prompt=f"以下是中国{{exam_type}}中{{exam_class}}考试的一道{{question_type}},不需要做任何分析和解释,直接输出答案选项。\n{{question}}\n{{option_str}} \n 答案: ",
+                ),
+                dict(role="BOT", prompt="{answer}"),
+            ],
+        ),
+        ice_token="",
+    ),
+    retriever=dict(type=FixKRetriever),
+    inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
+)
+
+cmb_datasets.append(
+    dict(
+        type=CMBDataset,
+        path="./data/CMB/",
+        abbr="cmb",
+        reader_cfg=cmb_reader_cfg,
+        infer_cfg=cmb_infer_cfg
+    )
+)
\ No newline at end of file
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index cac3dd72..041309cb 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -13,6 +13,7 @@ from .ceval import *  # noqa: F401, F403
 from .chid import *  # noqa: F401, F403
 from .civilcomments import *  # noqa: F401, F403
 from .cluewsc import *  # noqa: F401, F403
+from .cmb import *  # noqa: F401, F403
 from .cmmlu import *  # noqa: F401, F403
 from .cmnli import *  # noqa: F401, F403
 from .cmrc import *  # noqa: F401, F403
diff --git a/opencompass/datasets/cmb.py b/opencompass/datasets/cmb.py
new file mode 100644
index 00000000..5f53ec14
--- /dev/null
+++ b/opencompass/datasets/cmb.py
@@ -0,0 +1,42 @@
+import json
+import os.path as osp
+
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class CMBDataset(BaseDataset):
+    """Loader for the CMB multiple-choice dataset stored as JSON files."""
+
+    @staticmethod
+    def load(path: str):
+        """Load the ``test`` and ``val`` splits found under ``path``.
+
+        Args:
+            path: Directory containing ``test.json`` and ``val.json``. Each
+                file holds a list of records with an ``option`` mapping
+                from option label to option text.
+
+        Returns:
+            A ``DatasetDict`` with ``test`` and ``val`` splits. Every
+                record gains an ``option_str`` field: one ``label. text``
+                line per option whose text is longer than one character.
+        """
+        splits = {}
+        for split in ('test', 'val'):
+            # The data is Chinese text, so do not rely on the platform's
+            # default encoding (which is not UTF-8 everywhere).
+            with open(osp.join(path, f'{split}.json'), 'r',
+                      encoding='utf-8') as f:
+                records = json.load(f)
+            for record in records:
+                record['option_str'] = '\n'.join(
+                    f'{label}. {text}'
+                    for label, text in record['option'].items()
+                    if len(text) > 1)
+            splits[split] = Dataset.from_list(records)
+        return DatasetDict(splits)