From 41df5e5604ee82f28aa60ad868a6f0500c8a2926 Mon Sep 17 00:00:00 2001
From: root
Date: Sun, 4 May 2025 12:00:36 +0000
Subject: [PATCH 1/4] PromptCBLUE: Life Science dataset

---
 .../datasets/PromptCBLUE/PromptCBLUE_gen.py   |  64 +++++++++++
 .../PromptCBLUE/PromptCBLUE_llm_judge_gen.py  | 102 ++++++++++++++++++
 .../configs/datasets/PromptCBLUE/README.md    |   0
 opencompass/datasets/PromptCBLUE.py           |  61 +++++++++++
 opencompass/datasets/__init__.py              |   1 +
 5 files changed, 228 insertions(+)
 create mode 100644 opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_gen.py
 create mode 100644 opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_llm_judge_gen.py
 create mode 100644 opencompass/configs/datasets/PromptCBLUE/README.md
 create mode 100644 opencompass/datasets/PromptCBLUE.py

diff --git a/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_gen.py b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_gen.py
new file mode 100644
index 00000000..b81f5caa
--- /dev/null
+++ b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_gen.py
@@ -0,0 +1,64 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+from opencompass.datasets import PromptCBLUEDataset
+
+# 1. Sub-dataset names
+PromptCBLUE_lifescience_sets = [
+    'CHIP-CDN', 'CHIP-CTC', 'KUAKE-QIC', 'IMCS-V2-DAC',
+    'CHIP-STS', 'KUAKE-QQR', 'KUAKE-IR', 'KUAKE-QTR'
+]
+
+# 2. Reader config
+reader_cfg = dict(
+    input_columns=['input', 'answer_choices', 'options_str'],
+    output_column='target',
+    train_split='dev',
+)
+
+# 3. Prompt template: the last line is fixed to "ANSWER: $LETTER"
+_HINT = 'Given the ICD-10 candidate terms below, choose the normalized term(s) matching the original diagnosis.'
+
+query_template = f"""{_HINT}
+
+Original diagnosis: {{input}}
+
+Options:
+{{options_str}}
+
+The last line of your response must be exactly:
+ANSWER: $LETTER
+""".strip()
+
+infer_cfg_common = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[dict(role='HUMAN', prompt=query_template)]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# 4. Evaluation config: same style as MMLU
+eval_cfg_common = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_postprocessor=dict(type=first_capital_postprocess),
+)
+
+# 5. Assemble the dataset configs
+promptcblue_datasets = []
+for ds_name in PromptCBLUE_lifescience_sets:
+    promptcblue_datasets.append(dict(
+        abbr=f'promptcblue_{ds_name.lower().replace("-", "_")}_norm',
+        type=PromptCBLUEDataset,
+        path='/fs-computility/ai4sData/shared/lifescience/tangcheng/LifeScience/opencompass_val/datasets/PromptCBLUE',
+        name=ds_name,
+        reader_cfg=reader_cfg,
+        infer_cfg=infer_cfg_common,
+        eval_cfg=eval_cfg_common,
+    ))
+
+# ★ Export variable picked up by OpenCompass
+datasets = promptcblue_datasets
diff --git a/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_llm_judge_gen.py b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_llm_judge_gen.py
new file mode 100644
index 00000000..22de56a4
--- /dev/null
+++ b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_llm_judge_gen.py
@@ -0,0 +1,102 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets import PromptCBLUEDataset
+
+PromptCBLUE_lifescience_sets = [
+    'CHIP-CDN', 'CHIP-CTC', 'KUAKE-QIC', 'IMCS-V2-DAC',
+    'CHIP-STS', 'KUAKE-QQR', 'KUAKE-IR', 'KUAKE-QTR'
+]
+# Query template (keep original)
+QUERY_TEMPLATE = """
+Given a medical diagnosis description and labeled ICD-10 candidate terms below, select the matching normalized term(s).
+Original diagnosis: {input}
+
+Options:
+{options_str}
+
+The last line of your response must be exactly in the format:
+ANSWER:
+""".strip()
+
+# Grader template (keep original)
+GRADER_TEMPLATE = """
+As an expert evaluator, judge whether the candidate's answer matches the gold standard below.
+Return 'A' for CORRECT or 'B' for INCORRECT, with no additional text.
+
+Original diagnosis: {input}
+
+Options:
+{options_str}
+
+Gold answer: {target}
+
+Candidate answer: {prediction}
+""".strip()
+
+# Common reader config
+reader_cfg = dict(
+    input_columns=['input', 'answer_choices', 'options_str'],
+    output_column='target',
+    train_split='dev'
+)
+
+# Assemble LLM evaluation datasets
+promptcblue_llm_datasets = []
+for name in PromptCBLUE_lifescience_sets:
+    infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+            ]),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt='You are an expert judge for medical term normalization tasks.',
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                )
+            ),
+            dataset_cfg=dict(
+                type=PromptCBLUEDataset,
+                path='/fs-computility/ai4sData/shared/lifescience/tangcheng/LifeScience/opencompass_val/datasets/PromptCBLUE',
+                name=name,
+                reader_cfg=reader_cfg,
+            ),
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+        pred_role='BOT',
+    )
+
+    promptcblue_llm_datasets.append(
+        dict(
+            abbr=f"promptcblue_{name.lower().replace('-', '_')}_norm_llm",
+            type=PromptCBLUEDataset,
+            path='/fs-computility/ai4sData/shared/lifescience/tangcheng/LifeScience/opencompass_val/datasets/PromptCBLUE',
+            name=name,
+            reader_cfg=reader_cfg,
+            infer_cfg=infer_cfg,
+            eval_cfg=eval_cfg,
+            mode='singlescore',
+        )
+    )
diff --git a/opencompass/configs/datasets/PromptCBLUE/README.md b/opencompass/configs/datasets/PromptCBLUE/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/opencompass/datasets/PromptCBLUE.py b/opencompass/datasets/PromptCBLUE.py
new file mode 100644
index 00000000..0d71916e
--- /dev/null
+++ b/opencompass/datasets/PromptCBLUE.py
@@ -0,0 +1,61 @@
+import json
+import os.path as osp
+
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset  # keep the same import style as MMLUDataset
+
+
+@LOAD_DATASET.register_module()
+class PromptCBLUEDataset(BaseDataset):
+    """Loader for PromptCBLUE life-science tasks (CHIP-CDN, CHIP-CTC …).
+
+    - Only reads `dev.json`.
+    - Keeps every task type of the specified `task_dataset` (normalization, cls, etc.).
+    - If `target` is not in `answer_choices`, it is appended automatically, and an
+      `options_str` is generated (e.g. "A. option 1\\nB. option 2 …").
+    - Returns a `DatasetDict` with dev copied to test to satisfy the evaluation pipeline.
+    """
+
+    @staticmethod
+    def load(path: str, name: str, **kwargs):
+        path = get_data_path(path)
+        file_path = osp.join(path, 'dev.json')
+        if not osp.exists(file_path):
+            raise FileNotFoundError(f'`dev.json` not found under {path}')
+
+        records = []
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                rec = json.loads(line)
+                if rec.get('task_dataset') != name:
+                    continue  # filter by sub-dataset
+
+                choices = rec.get('answer_choices', []).copy()
+                target = rec.get('target')
+                if target not in choices:
+                    choices.append(target)
+
+                options_str = '\n'.join(f'{chr(65+i)}. {opt}'
+                                        for i, opt in enumerate(choices))
+
+                records.append({
+                    'input': rec['input'],
+                    'answer_choices': choices,
+                    'options_str': options_str,
+                    'target': target,
+                })
+
+        # keep the columns complete even if records is empty
+        if records:
+            ds = Dataset.from_list(records)
+        else:
+            ds = Dataset.from_dict({
+                k: []
+                for k in ['input', 'answer_choices', 'options_str', 'target']
+            })
+        dataset = DatasetDict(dev=ds, test=ds)  # dev and test point to the same split
+        return dataset
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index b00162d1..c034715a 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -115,6 +115,7 @@ from .OlympiadBench import *  # noqa: F401, F403
 from .OpenFinData import *  # noqa: F401, F403
 from .physics import *  # noqa: F401, F403
 from .piqa import *  # noqa: F401, F403
+from .PromptCBLUE import PromptCBLUEDataset  # noqa: F401, F403
 from .py150 import *  # noqa: F401, F403
 from .qasper import *  # noqa: F401, F403
 from .qaspercut import *  # noqa: F401, F403

From 4f8c1a2078291f7ce894136243db4b735cb023df Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 6 May 2025 15:05:05 +0000
Subject: [PATCH 2/4] revise name: PromptCBLUE: Life Science dataset

---
 .../{PromptCBLUE_gen.py => PromptCBLUE_0shot_gen_b1eb29.py}       | 0
 ..._llm_judge_gen.py => PromptCBLUE_0shot_llmjudge_gen_2ee607.py} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename opencompass/configs/datasets/PromptCBLUE/{PromptCBLUE_gen.py => PromptCBLUE_0shot_gen_b1eb29.py} (100%)
 rename opencompass/configs/datasets/PromptCBLUE/{PromptCBLUE_llm_judge_gen.py => PromptCBLUE_0shot_llmjudge_gen_2ee607.py} (100%)

diff --git a/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_gen.py b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_gen_b1eb29.py
similarity index 100%
rename from opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_gen.py
rename to opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_gen_b1eb29.py
diff --git a/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_llm_judge_gen.py b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_llmjudge_gen_2ee607.py
similarity index 100%
rename from opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_llm_judge_gen.py
rename to opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_llmjudge_gen_2ee607.py

From fba250a09430b36088204017a31fcc67b433d5f2 Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 9 May 2025 06:47:26 +0000
Subject: [PATCH 3/4] PromptCBLUE: Life Science dataset+data

---
 dataset-index.yml                               |  6 ++
 .../PromptCBLUE_0shot_gen_b1eb29.py             |  6 +-
 .../PromptCBLUE_0shot_llmjudge_gen_2ee607.py    |  6 +-
 opencompass/datasets/PromptCBLUE.py             | 83 ++++++++++---------
 4 files changed, 54 insertions(+), 47 deletions(-)

diff --git a/dataset-index.yml b/dataset-index.yml
index 9585f97c..3e36a865 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -671,6 +671,12 @@
     paper: https://arxiv.org/pdf/2009.03300
     configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py
     configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
+- PromptCBLUE:
+    name: PromptCBLUE
+    category: Understanding
+    paper: https://arxiv.org/pdf/2310.14151
+    configpath: opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_gen_b1eb29.py
+    configpath_llmjudge: opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_llmjudge_gen_2ee607.py
 - mmlu_cf:
     name: MMLU-CF
     category: Understanding
diff --git a/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_gen_b1eb29.py b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_gen_b1eb29.py
index b81f5caa..2480a4b6 100644
--- a/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_gen_b1eb29.py
+++ b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_gen_b1eb29.py
@@ -15,7 +15,7 @@ PromptCBLUE_lifescience_sets = [
 reader_cfg = dict(
     input_columns=['input', 'answer_choices', 'options_str'],
     output_column='target',
-    train_split='dev',
+    train_split='validation',
 )
 
 # 3. Prompt template: the last line is fixed to "ANSWER: $LETTER"
@@ -41,7 +41,7 @@ infer_cfg_common = dict(
     inferencer=dict(type=GenInferencer),
 )
 
-# 4. Evaluation config: same style as MMLU
+# 4. Evaluation config
 eval_cfg_common = dict(
     evaluator=dict(type=AccEvaluator),
     pred_postprocessor=dict(type=first_capital_postprocess),
@@ -53,7 +53,7 @@ for ds_name in PromptCBLUE_lifescience_sets:
     promptcblue_datasets.append(dict(
         abbr=f'promptcblue_{ds_name.lower().replace("-", "_")}_norm',
         type=PromptCBLUEDataset,
-        path='/fs-computility/ai4sData/shared/lifescience/tangcheng/LifeScience/opencompass_val/datasets/PromptCBLUE',
+        path='tchenglv/PromptCBLUE',
         name=ds_name,
         reader_cfg=reader_cfg,
         infer_cfg=infer_cfg_common,
diff --git a/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_llmjudge_gen_2ee607.py b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_llmjudge_gen_2ee607.py
index 22de56a4..cf2d8e43 100644
--- a/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_llmjudge_gen_2ee607.py
+++ b/opencompass/configs/datasets/PromptCBLUE/PromptCBLUE_0shot_llmjudge_gen_2ee607.py
@@ -41,7 +41,7 @@ Candidate answer: {prediction}
 reader_cfg = dict(
     input_columns=['input', 'answer_choices', 'options_str'],
     output_column='target',
-    train_split='dev'
+    train_split='validation'
 )
 
 # Assemble LLM evaluation datasets
@@ -78,7 +78,7 @@ for name in PromptCBLUE_lifescience_sets:
             ),
             dataset_cfg=dict(
                 type=PromptCBLUEDataset,
-                path='/fs-computility/ai4sData/shared/lifescience/tangcheng/LifeScience/opencompass_val/datasets/PromptCBLUE',
+                path='tchenglv/PromptCBLUE',
                 name=name,
                 reader_cfg=reader_cfg,
             ),
@@ -92,7 +92,7 @@ for name in PromptCBLUE_lifescience_sets:
         dict(
             abbr=f"promptcblue_{name.lower().replace('-', '_')}_norm_llm",
             type=PromptCBLUEDataset,
-            path='/fs-computility/ai4sData/shared/lifescience/tangcheng/LifeScience/opencompass_val/datasets/PromptCBLUE',
+            path='tchenglv/PromptCBLUE',
             name=name,
             reader_cfg=reader_cfg,
             infer_cfg=infer_cfg,
diff --git a/opencompass/datasets/PromptCBLUE.py b/opencompass/datasets/PromptCBLUE.py
index 0d71916e..3266c6ea 100644
--- a/opencompass/datasets/PromptCBLUE.py
+++ b/opencompass/datasets/PromptCBLUE.py
@@ -1,10 +1,6 @@
-import json
-import os.path as osp
-
-from datasets import Dataset, DatasetDict
+from datasets import Dataset, DatasetDict, load_dataset
 
 from opencompass.registry import LOAD_DATASET
-from opencompass.utils import get_data_path
 
 from .base import BaseDataset  # keep the same import style as MMLUDataset
 
 
@@ -13,49 +9,54 @@ class PromptCBLUEDataset(BaseDataset):
     """Loader for PromptCBLUE life-science tasks (CHIP-CDN, CHIP-CTC …).
 
-    - Only reads `dev.json`.
-    - Keeps every task type of the specified `task_dataset` (normalization, cls, etc.).
-    - If `target` is not in `answer_choices`, it is appended automatically, and an
-      `options_str` is generated (e.g. "A. option 1\\nB. option 2 …").
-    - Returns a `DatasetDict` with dev copied to test to satisfy the evaluation pipeline.
+    - Only reads the validation split.
+    - Keeps every task type of the specified `task_dataset`.
+    - If `target` is not in `answer_choices`, it is appended automatically, and an `options_str` is generated.
+    - Returns a `DatasetDict` containing `validation` and `test` to satisfy the evaluation pipeline.
     """
 
     @staticmethod
     def load(path: str, name: str, **kwargs):
-        path = get_data_path(path)
-        file_path = osp.join(path, 'dev.json')
-        if not osp.exists(file_path):
-            raise FileNotFoundError(f'`dev.json` not found under {path}')
+        # 1) Load the validation split from HuggingFace
+        hf_ds = load_dataset(path, split='validation', **kwargs)
 
+        # 2) Filter by sub-dataset and build records
         records = []
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                rec = json.loads(line)
-                if rec.get('task_dataset') != name:
-                    continue  # filter by sub-dataset
+        for rec in hf_ds:
+            if rec.get('task_dataset') != name:
+                continue
 
-                choices = rec.get('answer_choices', []).copy()
-                target = rec.get('target')
-                if target not in choices:
-                    choices.append(target)
+            choices = rec.get('answer_choices', []).copy()
+            target = rec.get('target')
+            if target not in choices:
+                choices.append(target)
 
-                options_str = '\n'.join(f'{chr(65+i)}. {opt}'
-                                        for i, opt in enumerate(choices))
+            options_str = '\n'.join(f'{chr(65 + i)}. {opt}'
+                                    for i, opt in enumerate(choices))
 
-                records.append({
-                    'input': rec['input'],
-                    'answer_choices': choices,
-                    'options_str': options_str,
-                    'target': target,
-                })
+            records.append({
+                'input': rec['input'],
+                'answer_choices': choices,
+                'options_str': options_str,
+                'target': target,
+            })
 
-        # keep the columns complete even if records is empty
-        if records:
-            ds = Dataset.from_list(records)
-        else:
-            ds = Dataset.from_dict({
-                k: []
-                for k in ['input', 'answer_choices', 'options_str', 'target']
-            })
-        dataset = DatasetDict(dev=ds, test=ds)  # dev and test point to the same split
-        return dataset
+        # 3) Build the Dataset
+        if records:
+            validation_ds = Dataset.from_list(records)
+        else:
+            validation_ds = Dataset.from_dict({
+                k: []
+                for k in [
+                    'input',
+                    'answer_choices',
+                    'options_str',
+                    'target',
+                ]
+            })
+
+        # 4) Return both validation and test
+        return DatasetDict(
+            validation=validation_ds,
+            test=validation_ds,
+        )

From 8ceec5217066605fff0dc6f1f354ac6c62f96329 Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 9 May 2025 07:13:58 +0000
Subject: [PATCH 4/4] PromptCBLUE: Life Science dataset+data

---
 opencompass/datasets/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index c034715a..65c13cf8 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -115,7 +115,7 @@ from .OlympiadBench import *  # noqa: F401, F403
 from .OpenFinData import *  # noqa: F401, F403
 from .physics import *  # noqa: F401, F403
 from .piqa import *  # noqa: F401, F403
-from .PromptCBLUE import PromptCBLUEDataset  # noqa: F401, F403
+from .PromptCBLUE import *  # noqa: F401, F403
 from .py150 import *  # noqa: F401, F403
 from .qasper import *  # noqa: F401, F403
 from .qaspercut import *  # noqa: F401, F403
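
For reference, the sketch below shows one way the new dataset configs could be wired into an OpenCompass run config once this series is applied. It assumes the standard `read_base()` mechanism and the `run.py` entry point; the model import is only a placeholder and is not part of this patch, so substitute whichever model config your installation actually ships.

# eval_promptcblue.py - hypothetical run config, not included in this patch series
from mmengine.config import read_base

with read_base():
    # `datasets` is the export variable defined at the end of the renamed config file
    from opencompass.configs.datasets.PromptCBLUE.PromptCBLUE_0shot_gen_b1eb29 import \
        datasets as promptcblue_datasets
    # placeholder model config (assumption): replace with any model config available locally
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as hf_models

datasets = promptcblue_datasets
models = hf_models

# Launch with: python run.py eval_promptcblue.py
# The LLM-judge variant (PromptCBLUE_0shot_llmjudge_gen_2ee607.py) additionally needs a
# judge model supplied through the evaluator's `judge_cfg`.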