Fix bugs for MedQA. Add info in dataset-index

Yejin0111 2025-05-08 14:41:15 +00:00
parent 63f80134c8
commit d28e3e4c80
5 changed files with 106 additions and 75 deletions

dataset-index.yml

@@ -122,6 +122,12 @@
     paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
     configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
     configpath_llmjudge: ''
+- MedQA:
+    name: MedQA
+    category: Knowledge / Medicine
+    paper: https://arxiv.org/abs/2009.13081
+    configpath: opencompass/configs/datasets/MedQA/MedQA_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py
 - MedXpertQA:
     name: MedXpertQA
     category: Knowledge / Medicine
@@ -739,6 +745,12 @@
     paper: https://arxiv.org/pdf/1911.11641v1
     configpath: opencompass/configs/datasets/piqa/piqa_gen.py
     configpath_llmjudge: ''
+- ProteinLMBench:
+    name: ProteinLMBench
+    category: Knowledge / Biology (Protein)
+    paper: https://arxiv.org/abs/2406.05540
+    configpath: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen.py
 - py150:
     name: py150
     category: Code
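Both new entries follow the schema of the surrounding records (name, category, paper, configpath, configpath_llmjudge). A quick lint pass catches the copy-paste slips this commit fixes, such as a dataset key that does not match its name field. Below is a minimal sketch, assuming PyYAML is available and the script runs from the repo root; the checks themselves are illustrative, not part of this commit.

import os

import yaml

REQUIRED = {'name', 'category', 'paper', 'configpath', 'configpath_llmjudge'}

with open('dataset-index.yml', 'r', encoding='utf-8') as f:
    # The index parses as a list of single-key dicts, e.g. {'MedQA': {...}}.
    index = yaml.safe_load(f)

seen = set()
for entry in index:
    (key, meta), = entry.items()
    # Duplicated keys across entries (e.g. a second 'MedXpertQA') are easy to miss.
    assert key not in seen, f'duplicate dataset key: {key}'
    seen.add(key)
    missing = REQUIRED - set(meta)
    assert not missing, f'{key}: missing fields {missing}'
    # configpath should point at a real config file; configpath_llmjudge may be ''.
    assert os.path.exists(meta['configpath']), f'{key}: bad configpath'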

opencompass/configs/datasets/MedQA/MedQA_gen.py

@@ -5,37 +5,59 @@ from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.utils.text_postprocessors import first_option_postprocess
 from opencompass.datasets.MedQA import MedQADataset
 
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
+
+Question:\n
+{question}
+
+Options:\n
+{choices}
+""".strip()
+
+MedQA_datasets = []
+
 MedQA_reader_cfg = dict(
-    input_columns=['question', 'A', 'B', 'C', 'D', 'choices'],
+    input_columns=['question', 'choices'],
     output_column='label',
-    test_split='validation')
+)
 
 MedQA_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
         template=dict(
             round=[
-                dict(
-                    role='HUMAN',
-                    prompt='\nQuestion: {question}\n{choices}\nAnswer:'
-                )
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
             ],
         ),
     ),
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer),
 )
 
-MedQA_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')
-)
+MedQA_subsets = {
+    'US': 'xuxuxuxuxu/MedQA_US_test',
+    'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
+    'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
+}
 
-MedQA_datasets = [
-    dict(
-        abbr='MedQA',
-        type=MedQADataset,
-        path='opencompass/MedQA',
-        reader_cfg=MedQA_reader_cfg,
-        infer_cfg=MedQA_infer_cfg,
-        eval_cfg=MedQA_eval_cfg)
-]
+for split in list(MedQA_subsets.keys()):
+
+    MedQA_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')
+    )
+
+    MedQA_datasets.append(
+        dict(
+            abbr=f'MedQA_{split}',
+            type=MedQADataset,
+            path=MedQA_subsets[split],
+            reader_cfg=MedQA_reader_cfg,
+            infer_cfg=MedQA_infer_cfg,
+            eval_cfg=MedQA_eval_cfg,
+        )
+    )
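The rewrite replaces the bare '\nQuestion: ... Answer:' prompt with a chain-of-thought template whose final line must read 'ANSWER: $LETTER', which is the pattern the ABCD postprocessor keys on. A rough illustration of the round trip follows; the sample record and the one-line regex are invented for the demo, and OpenCompass's actual first_option_postprocess is considerably more forgiving than this stand-in.

import re

QUERY_TEMPLATE = (
    "Answer the following multiple choice question. The last line of your "
    "response should be of the following format: 'ANSWER: $LETTER' (without "
    "quotes) where LETTER is one of the options. Think step by step before "
    "answering.\n\nQuestion:\n{question}\n\nOptions:\n{choices}"
)

# Hypothetical row shaped like MedQADataset's output: 'choices' is the
# pre-rendered "A. ...\nB. ..." block and 'label' holds the gold letter.
row = {
    'question': 'Which vitamin deficiency causes scurvy?',
    'choices': 'A. Vitamin A\nB. Vitamin B12\nC. Vitamin C\nD. Vitamin D\n',
    'label': 'C',
}
prompt = QUERY_TEMPLATE.format(**row)  # extra 'label' kwarg is ignored

reply = 'Scurvy is caused by lack of ascorbic acid.\nANSWER: C'
# Toy stand-in for first_option_postprocess(..., options='ABCD').
match = re.search(r'ANSWER:\s*([ABCD])', reply)
assert match and match.group(1) == row['label']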

opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py

@@ -43,9 +43,8 @@ GRADER_TEMPLATE = """
 MedQA_datasets = []
 
 MedQA_reader_cfg = dict(
-    input_columns=['question', 'A', 'B', 'C', 'D', 'choices'],
+    input_columns=['question', 'choices'],
     output_column='label',
-    test_split='validation',
 )
 
 MedQA_infer_cfg = dict(
@@ -61,41 +60,49 @@ MedQA_infer_cfg = dict(
     inferencer=dict(type=GenInferencer),
 )
 
-MedQA_eval_cfg = dict(
-    evaluator=dict(
-        type=GenericLLMEvaluator,
-        prompt_template=dict(
-            type=PromptTemplate,
-            template=dict(
-                begin=[
-                    dict(
-                        role='SYSTEM',
-                        fallback_role='HUMAN',
-                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
-                    )
-                ],
-                round=[
-                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
-                ],
-            ),
-        ),
-        dataset_cfg=dict(
-            type=MedQADataset,
-            path='opencompass/MedQA',
-            reader_cfg=MedQA_reader_cfg,
-        ),
-        judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
-    ),
-)
+MedQA_subsets = {
+    'US': 'xuxuxuxuxu/MedQA_US_test',
+    'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
+    'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
+}
 
-MedQA_datasets.append(
-    dict(
-        abbr=f'MedQA',
-        type=MedQADataset,
-        path='opencompass/MedQA',
-        reader_cfg=MedQA_reader_cfg,
-        infer_cfg=MedQA_infer_cfg,
-        eval_cfg=MedQA_eval_cfg,
-    )
-)
+for split in list(MedQA_subsets.keys()):
+
+    MedQA_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(
+                type=MedQADataset,
+                path=MedQA_subsets[split],
+                reader_cfg=MedQA_reader_cfg,
+            ),
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+    )
+
+    MedQA_datasets.append(
+        dict(
+            abbr=f'MedQA_{split}',
+            type=MedQADataset,
+            path=MedQA_subsets[split],
+            reader_cfg=MedQA_reader_cfg,
+            infer_cfg=MedQA_infer_cfg,
+            eval_cfg=MedQA_eval_cfg,
+        )
+    )
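Note that MedQA_eval_cfg is rebuilt on every pass through the loop, so each appended dataset dict carries an evaluator whose dataset_cfg points at its own subset. Building the dict once outside the loop and mutating its path per iteration would instead alias one shared object across all three entries. A small sketch of the pitfall, with illustrative names rather than the real config structure:

subsets = {'US': 'us_path', 'Mainland': 'cn_path', 'Taiwan': 'tw_path'}

# Wrong: one shared dict, mutated in place.
shared = {'dataset_cfg': {'path': None}}
wrong = []
for split, path in subsets.items():
    shared['dataset_cfg']['path'] = path
    wrong.append({'abbr': f'MedQA_{split}', 'eval_cfg': shared})
# Every entry now sees the last value written.
assert all(d['eval_cfg']['dataset_cfg']['path'] == 'tw_path' for d in wrong)

# Right (what the config above does): a fresh dict per iteration.
right = []
for split, path in subsets.items():
    eval_cfg = {'dataset_cfg': {'path': path}}
    right.append({'abbr': f'MedQA_{split}', 'eval_cfg': eval_cfg})
assert [d['eval_cfg']['dataset_cfg']['path'] for d in right] == [
    'us_path', 'cn_path', 'tw_path'
]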

opencompass/datasets/MedQA.py

@@ -1,4 +1,4 @@
-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import Dataset, load_dataset
 
 from opencompass.registry import LOAD_DATASET
 
@@ -11,18 +11,13 @@ class MedQADataset(BaseDataset):
 
     @staticmethod
     def load_single(path):
         dataset = []
-        data_lines = load_dataset(path, 'test')  # "data/MedQA"
-        num = 0
-        for data in data_lines:
-            num += 1
+        ds = load_dataset(path)
+        for data in ds['train']:
+            data['label'] = data['answer_idx']
             choices = ''
-            for i in range(4):
-                data[chr(65 + i)] = data['ending' + str(i)]
-                choices += chr(65 + i) + '. ' + data['ending' + str(i)] + '\n'
-            data['question'] = data['sent1']
+            for option in data['options']:
+                choices += option + '. ' + data['options'][option] + '\n'
             data['choices'] = choices
-            data['label'] = chr(65 + int(data['label'])) + '. ' + data[
-                'ending' + str(data['label'])]
             dataset.append(data)
 
@@ -30,10 +25,5 @@ class MedQADataset(BaseDataset):
 
     @staticmethod
     def load(path):
-        train_dataset = Dataset.from_list([])
-        val_dataset = MedQADataset.load_single(path)  # "data/MedQA/test.json"
-        dataset = DatasetDict({
-            'train': train_dataset,
-            'validation': val_dataset
-        })
+        dataset = MedQADataset.load_single(path)
         return dataset
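The rewritten loader consumes the Hugging Face rows directly: options arrives as a letter-to-text mapping and answer_idx as the gold letter, so the loop only has to flatten options into a choices block and copy answer_idx into label (the old ending0..ending3/sent1 remapping and the synthetic train/validation DatasetDict both disappear). Roughly, for a made-up row whose field names follow the diff but whose content is invented:

# Hypothetical record as served by the xuxuxuxuxu/MedQA_*_test repos.
data = {
    'question': 'Which vitamin deficiency causes scurvy?',
    'options': {'A': 'Vitamin A', 'B': 'Vitamin B12',
                'C': 'Vitamin C', 'D': 'Vitamin D'},
    'answer_idx': 'C',
}

# Same transformation as load_single: relabel the answer, flatten options.
data['label'] = data['answer_idx']
choices = ''
for option in data['options']:
    choices += option + '. ' + data['options'][option] + '\n'
data['choices'] = choices

assert data['choices'].splitlines()[2] == 'C. Vitamin C'
assert data['label'] == 'C'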