[Dataset] Add CMB (#376)

* Add CMB

* modify CMB

---------

Co-authored-by: wangxidong <xidongw@163.com>
This commit is contained in:
Xidong Wang 2023-09-12 19:16:41 +08:00 committed by GitHub
parent 4d89533fbc
commit 47a752cd56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 78 additions and 0 deletions

View File

@ -0,0 +1,4 @@
# Default CMB dataset config: pulls `cmb_datasets` in from the sibling
# `cmb_gen_72cbb7` config module inside mmengine's `read_base()` context.
# The imported name is not referenced again in this file, hence the noqa.
from mmengine.config import read_base
with read_base():
from .cmb_gen_72cbb7 import cmb_datasets # noqa: F401, F403

View File

@ -0,0 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMBDataset
# Reader settings. The prompt template below references these five input
# columns; no output column is declared. The "val" split is used as the
# train split and "test" as the test split.
cmb_reader_cfg = {
    "input_columns": [
        "exam_type",
        "exam_class",
        "question_type",
        "question",
        "option_str",
    ],
    "output_column": None,
    "train_split": "val",
    "test_split": "test",
}

# Inference settings: a two-turn (HUMAN / BOT) in-context-example template,
# a fixed-K retriever and a generation inferencer.
cmb_infer_cfg = {
    "ice_template": {
        "type": PromptTemplate,
        "template": {
            "begin": "</E>",
            "round": [
                {
                    "role": "HUMAN",
                    # English gloss of the prompt: "The following is a
                    # {question_type} from the Chinese {exam_type}
                    # {exam_class} exam. No analysis or explanation is
                    # needed; output the answer option directly."
                    # The doubled braces in the f-string produce literal
                    # {placeholders} in the resulting string.
                    "prompt": f"以下是中国{{exam_type}}{{exam_class}}考试的一道{{question_type}},不需要做任何分析和解释,直接输出答案选项。\n{{question}}\n{{option_str}} \n 答案: ",
                },
                {"role": "BOT", "prompt": "{answer}"},
            ],
        },
        "ice_token": "</E>",
    },
    "retriever": {"type": FixKRetriever},
    # NOTE(review): presumably these ids select the five in-context examples
    # taken from the train ("val") split -- confirm against the
    # retriever/inferencer implementation.
    "inferencer": {"type": GenInferencer, "fix_id_list": [0, 1, 2, 3, 4]},
}

# Single dataset entry exported by this config.
cmb_datasets = [
    {
        "type": CMBDataset,
        "path": "./data/CMB/",
        "abbr": "cmb",
        "reader_cfg": cmb_reader_cfg,
        "infer_cfg": cmb_infer_cfg,
    },
]

View File

@ -13,6 +13,7 @@ from .ceval import * # noqa: F401, F403
from .chid import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403
from .cluewsc import * # noqa: F401, F403
from .cmb import * # noqa: F401, F403
from .cmmlu import * # noqa: F401, F403
from .cmnli import * # noqa: F401, F403
from .cmrc import * # noqa: F401, F403

View File

@ -0,0 +1,30 @@
import json
import os.path as osp
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class CMBDataset(BaseDataset):
    """Loader for the CMB dataset stored as ``test.json`` and ``val.json``.

    Each JSON file is expected to hold a list of records, and every record
    must carry an ``option`` mapping of option label to option text. A
    rendered ``option_str`` field is added to every record.
    """

    @staticmethod
    def _read_split(path: str, filename: str) -> list:
        """Read one split file and attach ``option_str`` to each record.

        Args:
            path: Directory containing the split files.
            filename: Name of the JSON file inside ``path``.

        Returns:
            The decoded list of records, each extended with ``option_str``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If a record has no ``option`` entry.
        """
        # Decode explicitly as UTF-8: without ``encoding`` the platform
        # locale codec is used, which can fail or corrupt non-ASCII text.
        with open(osp.join(path, filename), 'r', encoding='utf-8') as f:
            records = json.load(f)
        for record in records:
            # One "<label>. <text>" line per option. Options whose text is
            # at most one character long are left out.
            # NOTE(review): presumably this drops empty placeholder options;
            # it also drops genuine one-character options -- confirm intent.
            record['option_str'] = '\n'.join(
                f'{label}. {text}'
                for label, text in record['option'].items()
                if len(text) > 1)
        return records

    @staticmethod
    def load(path: str):
        """Load the ``test`` and ``val`` splits found under ``path``.

        Args:
            path: Directory containing ``test.json`` and ``val.json``.

        Returns:
            A ``DatasetDict`` with the keys ``'test'`` and ``'val'``.
        """
        test_data = CMBDataset._read_split(path, 'test.json')
        val_data = CMBDataset._read_split(path, 'val.json')
        return DatasetDict({
            'test': Dataset.from_list(test_data),
            'val': Dataset.from_list(val_data),
        })