PromptCBLUE: Life Science dataset

This commit is contained in:
root 2025-05-04 12:00:36 +00:00
parent 8c74e6a39e
commit 41df5e5604
5 changed files with 228 additions and 0 deletions

View File

@@ -0,0 +1,64 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.utils.text_postprocessors import first_capital_postprocess
from opencompass.datasets import PromptCBLUEDataset

# 1. Sub-dataset names
PromptCBLUE_lifescience_sets = [
    'CHIP-CDN', 'CHIP-CTC', 'KUAKE-QIC', 'IMCS-V2-DAC',
    'CHIP-STS', 'KUAKE-QQR', 'KUAKE-IR', 'KUAKE-QTR'
]

# 2. Reader configuration
reader_cfg = dict(
    input_columns=['input', 'answer_choices', 'options_str'],
    output_column='target',
    train_split='dev',
)

# 3. Prompt template: the final line is fixed to "ANSWER: $LETTER"
_HINT = 'Given the ICD-10 candidate terms below, choose the normalized term(s) matching the original diagnosis.'
query_template = f"""{_HINT}
Original diagnosis: {{input}}
Options:
{{options_str}}
The last line of your response must be exactly:
ANSWER: $LETTER
""".strip()

infer_cfg_common = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt=query_template)]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# 4. Evaluation config: same recipe as MMLU
eval_cfg_common = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=first_capital_postprocess),
)

# 5. Assemble the dataset configs
promptcblue_datasets = []
for ds_name in PromptCBLUE_lifescience_sets:
    promptcblue_datasets.append(dict(
        abbr=f'promptcblue_{ds_name.lower().replace("-", "_")}_norm',
        type=PromptCBLUEDataset,
        path='/fs-computility/ai4sData/shared/lifescience/tangcheng/LifeScience/opencompass_val/datasets/PromptCBLUE',
        name=ds_name,
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg_common,
        eval_cfg=eval_cfg_common,
    ))

# ★ The export variable OpenCompass looks for
datasets = promptcblue_datasets
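
For reference, a config like this is normally pulled into a top-level evaluation config via `read_base()`. A minimal sketch under that assumption follows; the relative import path is hypothetical and not part of this commit:

from mmengine.config import read_base

with read_base():
    # hypothetical location of the file above inside configs/
    from .datasets.PromptCBLUE.promptcblue_lifescience_gen import \
        promptcblue_datasets

datasets = promptcblue_datasets
# a `models` list would be defined or imported alongside before launching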

View File

@@ -0,0 +1,102 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import PromptCBLUEDataset

PromptCBLUE_lifescience_sets = [
    'CHIP-CDN', 'CHIP-CTC', 'KUAKE-QIC', 'IMCS-V2-DAC',
    'CHIP-STS', 'KUAKE-QQR', 'KUAKE-IR', 'KUAKE-QTR'
]

# Query template (kept unchanged)
QUERY_TEMPLATE = """
Given a medical diagnosis description and the labeled ICD-10 candidate terms below, select the matching normalized term(s).
Original diagnosis: {input}
Options:
{options_str}
The last line of your response must be exactly in the format:
ANSWER: <LETTER(S)>
""".strip()

# Grader template (kept unchanged)
GRADER_TEMPLATE = """
As an expert evaluator, judge whether the candidate's answer matches the gold standard below.
Return 'A' for CORRECT or 'B' for INCORRECT, with no additional text.
Original diagnosis: {input}
Options:
{options_str}
Gold answer: {target}
Candidate answer: {prediction}
""".strip()

# Common reader config
reader_cfg = dict(
    input_columns=['input', 'answer_choices', 'options_str'],
    output_column='target',
    train_split='dev',
)

# Assemble the LLM-judged evaluation datasets
promptcblue_llm_datasets = []
for name in PromptCBLUE_lifescience_sets:
    infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='You are an expert judge for medical term normalization tasks.',
                        )
                    ],
                    round=[
                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                    ],
                ),
            ),
            dataset_cfg=dict(
                type=PromptCBLUEDataset,
                path='/fs-computility/ai4sData/shared/lifescience/tangcheng/LifeScience/opencompass_val/datasets/PromptCBLUE',
                name=name,
                reader_cfg=reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )
    promptcblue_llm_datasets.append(
        dict(
            abbr=f"promptcblue_{name.lower().replace('-', '_')}_norm_llm",
            type=PromptCBLUEDataset,
            path='/fs-computility/ai4sData/shared/lifescience/tangcheng/LifeScience/opencompass_val/datasets/PromptCBLUE',
            name=name,
            reader_cfg=reader_cfg,
            infer_cfg=infer_cfg,
            eval_cfg=eval_cfg,
            mode='singlescore',
        )
    )
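
`judge_cfg` is left empty above, so the judge model has to be supplied at run time. A sketch of filling it in directly, assuming an OpenAI-compatible judge served through `opencompass.models.OpenAI` (the abbr, model name, and limits are placeholders, not part of this commit):

from opencompass.models import OpenAI

judge_cfg = dict(
    type=OpenAI,
    abbr='judge-model',   # placeholder
    path='gpt-4o',        # placeholder judge model name
    key='ENV',            # read the API key from the environment
    max_out_len=1024,
    batch_size=8,
)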

View File

@@ -0,0 +1,61 @@
import json
import os.path as osp

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset  # same import style as MMLUDataset


@LOAD_DATASET.register_module()
class PromptCBLUEDataset(BaseDataset):
    """Loader for PromptCBLUE life-science tasks (CHIP-CDN, CHIP-CTC …).

    - Reads `dev.json` only.
    - Keeps every task type belonging to the requested `task_dataset`,
      including normalization and classification (cls).
    - If `target` is missing from `answer_choices`, it is appended
      automatically, and `options_str` is generated in the form
      "A. option1\\nB. option2".
    - Returns a `DatasetDict` whose dev split is duplicated as test to
      satisfy the evaluation pipeline.
    """

    @staticmethod
    def load(path: str, name: str, **kwargs):
        path = get_data_path(path)
        file_path = osp.join(path, 'dev.json')
        if not osp.exists(file_path):
            raise FileNotFoundError(f'`dev.json` not found under {path}')

        records = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                rec = json.loads(line)
                if rec.get('task_dataset') != name:
                    continue  # filter out other sub-datasets
                choices = rec.get('answer_choices', []).copy()
                target = rec.get('target')
                if target not in choices:
                    choices.append(target)
                options_str = '\n'.join(f'{chr(65 + i)}. {opt}'
                                        for i, opt in enumerate(choices))
                records.append({
                    'input': rec['input'],
                    'answer_choices': choices,
                    'options_str': options_str,
                    'target': target,
                })

        # Guarantee the full column set even when records is empty
        if records:
            ds = Dataset.from_list(records)
        else:
            ds = Dataset.from_dict({
                k: []
                for k in ['input', 'answer_choices', 'options_str', 'target']
            })
        dataset = DatasetDict(dev=ds, test=ds)  # dev and test share the same data
        return dataset
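
Judging from the field accesses in `load()`, `dev.json` is expected to be JSON Lines with at least `task_dataset`, `input`, `answer_choices`, and `target` per record. A quick smoke test under that assumption (the path and the sample record below are made up):

# Illustrative record, not real data:
# {"task_dataset": "CHIP-CDN", "input": "2型糖尿病伴酮症",
#  "answer_choices": ["2型糖尿病性酮症", "糖尿病酮症酸中毒"],
#  "target": "2型糖尿病性酮症"}
from opencompass.datasets import PromptCBLUEDataset

ds = PromptCBLUEDataset.load(path='/path/to/PromptCBLUE', name='CHIP-CDN')
print(len(ds['dev']))
print(ds['dev'][0]['options_str'] if len(ds['dev']) else '(no records)')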

View File

@@ -115,6 +115,7 @@ from .OlympiadBench import * # noqa: F401, F403
from .OpenFinData import * # noqa: F401, F403
from .physics import * # noqa: F401, F403
from .piqa import * # noqa: F401, F403
from .PromptCBLUE import PromptCBLUEDataset # noqa: F401
from .py150 import * # noqa: F401, F403
from .qasper import * # noqa: F401, F403
from .qaspercut import * # noqa: F401, F403
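
With the import registered here, a dataset entry from the configs above can be materialized outside the full runner. A sketch assuming `opencompass.utils.build_dataset_from_cfg`, the helper OpenCompass tasks use internally (if the name differs in your version, any registry-based builder works the same way):

from mmengine.config import ConfigDict
from opencompass.utils import build_dataset_from_cfg

# one entry from promptcblue_datasets; the builder drops the infer/eval
# sub-configs and dispatches on `type` through the LOAD_DATASET registry
cfg = ConfigDict(promptcblue_datasets[0])
dataset = build_dataset_from_cfg(cfg)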