mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
62 lines
2.1 KiB
Python
62 lines
2.1 KiB
Python
![]() |
import json
|
|||
|
import os.path as osp
|
|||
|
|
|||
|
from datasets import Dataset, DatasetDict
|
|||
|
|
|||
|
from opencompass.registry import LOAD_DATASET
|
|||
|
from opencompass.utils import get_data_path
|
|||
|
|
|||
|
from .base import BaseDataset # 保持与 MMLUDataset 同级的导包风格
|
|||
|
|
|||
|
|
|||
|
@LOAD_DATASET.register_module()
|
|||
|
class PromptCBLUEDataset(BaseDataset):
|
|||
|
"""Loader for PromptCBLUE life-science tasks (CHIP-CDN, CHIP-CTC …).
|
|||
|
|
|||
|
- 只读 `dev.json`。
|
|||
|
- 保留指定 `task_dataset` 的所有任务类型(包括 normalization、cls 等)。
|
|||
|
- 若 `target` 不在 `answer_choices`,自动追加;并生成 `options_str`
|
|||
|
(形如 “A. 选项1\\nB. 选项2 …”)。
|
|||
|
- 返回 `DatasetDict`,将 dev 复制到 test 以满足评估流程。
|
|||
|
"""
|
|||
|
|
|||
|
@staticmethod
|
|||
|
def load(path: str, name: str, **kwargs):
|
|||
|
path = get_data_path(path)
|
|||
|
file_path = osp.join(path, 'dev.json')
|
|||
|
if not osp.exists(file_path):
|
|||
|
raise FileNotFoundError(f'`dev.json` not found under {path}')
|
|||
|
|
|||
|
records = []
|
|||
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|||
|
for line in f:
|
|||
|
rec = json.loads(line)
|
|||
|
if rec.get('task_dataset') != name:
|
|||
|
continue # 过滤子数据集
|
|||
|
|
|||
|
choices = rec.get('answer_choices', []).copy()
|
|||
|
target = rec.get('target')
|
|||
|
if target not in choices:
|
|||
|
choices.append(target)
|
|||
|
|
|||
|
options_str = '\n'.join(f'{chr(65+i)}. {opt}'
|
|||
|
for i, opt in enumerate(choices))
|
|||
|
|
|||
|
records.append({
|
|||
|
'input': rec['input'],
|
|||
|
'answer_choices': choices,
|
|||
|
'options_str': options_str,
|
|||
|
'target': target,
|
|||
|
})
|
|||
|
|
|||
|
# 保证列完整,即使 records 为空
|
|||
|
if records:
|
|||
|
ds = Dataset.from_list(records)
|
|||
|
else:
|
|||
|
ds = Dataset.from_dict({
|
|||
|
k: []
|
|||
|
for k in ['input', 'answer_choices', 'options_str', 'target']
|
|||
|
})
|
|||
|
dataset = DatasetDict(dev=ds, test=ds) # dev 与 test 指向同一份
|
|||
|
return dataset
|