From d28e3e4c8079ca709c24b8a8217abb812520f27f Mon Sep 17 00:00:00 2001 From: Yejin0111 Date: Thu, 8 May 2025 14:41:15 +0000 Subject: [PATCH] Fix bugs for MedQA. Add info in dataset-index --- dataset-index.yml | 12 +++ .../configs/datasets/MedQA/MedQA_gen.py | 62 +++++++++----- ...llm_judge_gen.py => MedQA_llmjudge_gen.py} | 83 ++++++++++--------- ..._gen.py => ProteinLMBench_llmjudge_gen.py} | 0 opencompass/datasets/MedQA.py | 24 ++---- 5 files changed, 106 insertions(+), 75 deletions(-) rename opencompass/configs/datasets/MedQA/{MedQA_llm_judge_gen.py => MedQA_llmjudge_gen.py} (72%) rename opencompass/configs/datasets/ProteinLMBench/{ProteinLMBench_llm_judge_gen.py => ProteinLMBench_llmjudge_gen.py} (100%) diff --git a/dataset-index.yml b/dataset-index.yml index 9585f97c..f08873f3 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -122,6 +122,12 @@ paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 configpath: opencompass/configs/datasets/MedBench/medbench_gen.py configpath_llmjudge: '' +- MedQA: + name: MedQA + category: Knowledge / Medicine + paper: https://arxiv.org/abs/2009.13081 + configpath: opencompass/configs/datasets/MedQA/MedQA_gen.py + configpath_llmjudge: opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py - MedXpertQA: + name: MedXpertQA + category: Knowledge / Medicine @@ -739,6 +745,12 @@ paper: https://arxiv.org/pdf/1911.11641v1 configpath: opencompass/configs/datasets/piqa/piqa_gen.py configpath_llmjudge: '' +- ProteinLMBench: + name: ProteinLMBench + category: Knowledge / Biology (Protein) + paper: https://arxiv.org/abs/2406.05540 + configpath: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen.py + configpath_llmjudge: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen.py - py150: name: py150 category: Code diff --git a/opencompass/configs/datasets/MedQA/MedQA_gen.py b/opencompass/configs/datasets/MedQA/MedQA_gen.py index 95ace53a..01306134 100% --- 
a/opencompass/configs/datasets/MedQA/MedQA_gen.py +++ b/opencompass/configs/datasets/MedQA/MedQA_gen.py @@ -5,37 +5,59 @@ from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.utils.text_postprocessors import first_option_postprocess from opencompass.datasets.MedQA import MedQADataset + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. + +Question:\n +{question} + +Options:\n +{choices} + +""".strip() + + +MedQA_datasets = [] + MedQA_reader_cfg = dict( - input_columns=['question', 'A', 'B', 'C', 'D', 'choices'], + input_columns=['question', 'choices'], output_column='label', - test_split='validation') +) MedQA_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict( - role='HUMAN', - prompt='\nQuestion: {question}\n{choices}\nAnswer:' - ) - ], ), + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), ), retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer), ) -MedQA_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') -) +MedQA_subsets = { + 'US': 'xuxuxuxuxu/MedQA_US_test', + 'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test', + 'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test', +} -MedQA_datasets = [ - dict( - abbr='MedQA', - type=MedQADataset, - path='opencompass/MedQA', - reader_cfg=MedQA_reader_cfg, - infer_cfg=MedQA_infer_cfg, - eval_cfg=MedQA_eval_cfg) -] +for split in list(MedQA_subsets.keys()): + + MedQA_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') + ) + + MedQA_datasets.append( + dict( + abbr=f'MedQA_{split}', + type=MedQADataset, + path=MedQA_subsets[split], + reader_cfg=MedQA_reader_cfg, + infer_cfg=MedQA_infer_cfg, + 
eval_cfg=MedQA_eval_cfg, + ) + ) diff --git a/opencompass/configs/datasets/MedQA/MedQA_llm_judge_gen.py b/opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py similarity index 72% rename from opencompass/configs/datasets/MedQA/MedQA_llm_judge_gen.py rename to opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py index d36a8bee..d6c19119 100644 --- a/opencompass/configs/datasets/MedQA/MedQA_llm_judge_gen.py +++ b/opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py @@ -43,9 +43,8 @@ GRADER_TEMPLATE = """ MedQA_datasets = [] MedQA_reader_cfg = dict( - input_columns=['question', 'A', 'B', 'C', 'D', 'choices'], + input_columns=['question', 'choices'], output_column='label', - test_split='validation', ) MedQA_infer_cfg = dict( @@ -61,41 +60,49 @@ MedQA_infer_cfg = dict( inferencer=dict(type=GenInferencer), ) -MedQA_eval_cfg = dict( - evaluator=dict( - type=GenericLLMEvaluator, - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", - ) - ], - round=[ - dict(role='HUMAN', prompt=GRADER_TEMPLATE), - ], - ), - ), - dataset_cfg=dict( - type=MedQADataset, - path='opencompass/MedQA', - reader_cfg=MedQA_reader_cfg, - ), - judge_cfg=dict(), - dict_postprocessor=dict(type=generic_llmjudge_postprocess), - ), -) +MedQA_subsets = { + 'US': 'xuxuxuxuxu/MedQA_US_test', + 'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test', + 'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test', +} -MedQA_datasets.append( - dict( - abbr=f'MedQA', - type=MedQADataset, - path='opencompass/MedQA', - reader_cfg=MedQA_reader_cfg, - infer_cfg=MedQA_infer_cfg, - eval_cfg=MedQA_eval_cfg, +for split in list(MedQA_subsets.keys()): + + MedQA_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful 
assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MedQADataset, + path=MedQA_subsets[split], + reader_cfg=MedQA_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ) + + MedQA_datasets.append( + dict( + abbr=f'MedQA_{split}', + type=MedQADataset, + path=MedQA_subsets[split], + reader_cfg=MedQA_reader_cfg, + infer_cfg=MedQA_infer_cfg, + eval_cfg=MedQA_eval_cfg, + ) ) -) diff --git a/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llm_judge_gen.py b/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen.py similarity index 100% rename from opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llm_judge_gen.py rename to opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen.py diff --git a/opencompass/datasets/MedQA.py b/opencompass/datasets/MedQA.py index 7bbe461c..256f9910 100644 --- a/opencompass/datasets/MedQA.py +++ b/opencompass/datasets/MedQA.py @@ -1,4 +1,4 @@ -from datasets import Dataset, DatasetDict, load_dataset +from datasets import Dataset, load_dataset from opencompass.registry import LOAD_DATASET @@ -11,18 +11,13 @@ class MedQADataset(BaseDataset): @staticmethod def load_single(path): dataset = [] - data_lines = load_dataset(path, 'test') # "data/MedQA" - num = 0 - for data in data_lines: - num += 1 + ds = load_dataset(path) + for data in ds['train']: + data['label'] = data['answer_idx'] choices = '' - for i in range(4): - data[chr(65 + i)] = data['ending' + str(i)] - choices += chr(65 + i) + '. ' + data['ending' + str(i)] + '\n' - data['question'] = data['sent1'] + for option in data['options']: + choices += option + '. ' + data['options'][option] + '\n' data['choices'] = choices - data['label'] = chr(65 + int(data['label'])) + '. 
' + data[ - 'ending' + str(data['label'])] dataset.append(data) @@ -30,10 +25,5 @@ class MedQADataset(BaseDataset): @staticmethod def load(path): - train_dataset = Dataset.from_list([]) - val_dataset = MedQADataset.load_single(path) # "data/MedQA/test.json" - dataset = DatasetDict({ - 'train': train_dataset, - 'validation': val_dataset - }) + dataset = MedQADataset.load_single(path) return dataset