Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Fix bugs for MedQA. Add info in dataset-index

This commit is contained in:
parent 63f80134c8
commit d28e3e4c80
dataset-index.yml

@@ -122,6 +122,12 @@
     paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
     configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
     configpath_llmjudge: ''
+- MedQA:
+    name: MedQA
+    category: Knowledge / Medicine
+    paper: https://arxiv.org/abs/2009.13081
+    configpath: opencompass/configs/datasets/MedQA/MedQA_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py
 - MedXpertQA:
     name: MedXpertQA
     category: Knowledge / Medicine
@@ -739,6 +745,12 @@
     paper: https://arxiv.org/pdf/1911.11641v1
     configpath: opencompass/configs/datasets/piqa/piqa_gen.py
     configpath_llmjudge: ''
+- ProteinLMBench:
+    name: ProteinLMBench
+    category: Knowledge / Biology (Protein)
+    paper: https://arxiv.org/abs/2406.05540
+    configpath: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen.py
 - py150:
     name: py150
     category: Code
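Both added index entries follow the same five-field shape: name, category, paper, configpath, configpath_llmjudge. As a minimal sketch of how such entries could be sanity-checked, assuming PyYAML is available and that dataset-index.yml is a top-level list of single-key mappings like the ones above (the checker itself is hypothetical, not part of this commit):

import os
import yaml  # assumption: PyYAML is installed

# Hypothetical check: every indexed config path should exist in the repo.
with open('dataset-index.yml') as f:
    index = yaml.safe_load(f)

for entry in index:
    (key, meta), = entry.items()  # each list item is a single-key mapping
    for field in ('configpath', 'configpath_llmjudge'):
        path = meta.get(field) or ''
        if path and not os.path.exists(path):
            print(f'{key}: {field} points at a missing file: {path}')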
opencompass/configs/datasets/MedQA/MedQA_gen.py

@@ -5,37 +5,59 @@ from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.utils.text_postprocessors import first_option_postprocess
 from opencompass.datasets.MedQA import MedQADataset
 
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
+
+Question:\n
+{question}
+
+Options:\n
+{choices}
+
+""".strip()
+
+
+MedQA_datasets = []
+
 MedQA_reader_cfg = dict(
-    input_columns=['question', 'A', 'B', 'C', 'D', 'choices'],
+    input_columns=['question', 'choices'],
     output_column='label',
-    test_split='validation')
+)
 
 MedQA_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
         template=dict(
             round=[
-                dict(
-                    role='HUMAN',
-                    prompt='\nQuestion: {question}\n{choices}\nAnswer:'
-                )
-            ], ),
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+            ],
+        ),
     ),
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer),
 )
 
-MedQA_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')
-)
-
-MedQA_datasets = [
-    dict(
-        abbr='MedQA',
-        type=MedQADataset,
-        path='opencompass/MedQA',
-        reader_cfg=MedQA_reader_cfg,
-        infer_cfg=MedQA_infer_cfg,
-        eval_cfg=MedQA_eval_cfg)
-]
+MedQA_subsets = {
+    'US': 'xuxuxuxuxu/MedQA_US_test',
+    'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
+    'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
+}
+
+for split in list(MedQA_subsets.keys()):
+
+    MedQA_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')
+    )
+
+    MedQA_datasets.append(
+        dict(
+            abbr=f'MedQA_{split}',
+            type=MedQADataset,
+            path=MedQA_subsets[split],
+            reader_cfg=MedQA_reader_cfg,
+            infer_cfg=MedQA_infer_cfg,
+            eval_cfg=MedQA_eval_cfg,
+        )
+    )
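For a concrete look at the new prompt, the sketch below renders QUERY_TEMPLATE for one invented record; the question and options are made up, and only the template text comes from this diff:

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.

Question:\n
{question}

Options:\n
{choices}

""".strip()

# Made-up record, for illustration only.
prompt = QUERY_TEMPLATE.format(
    question='Which vitamin deficiency causes scurvy?',
    choices='A. Vitamin A\nB. Vitamin B12\nC. Vitamin C\nD. Vitamin D\n',
)
print(prompt)
# A well-formed reply ends with a line like 'ANSWER: C', from which the
# configured first_option_postprocess(..., options='ABCD') should recover
# 'C' for AccEvaluator to compare against the record's label.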
opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py

@@ -43,9 +43,8 @@ GRADER_TEMPLATE = """
 MedQA_datasets = []
 
 MedQA_reader_cfg = dict(
-    input_columns=['question', 'A', 'B', 'C', 'D', 'choices'],
+    input_columns=['question', 'choices'],
     output_column='label',
-    test_split='validation',
 )
 
 MedQA_infer_cfg = dict(
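Dropping test_split='validation' here mirrors the loader change further down: MedQADataset.load() now returns a single flat split, so there is no validation split to select. As a rough illustration of what the two remaining reader fields imply (hypothetical, not OpenCompass internals):

# The declared input columns feed the prompt template; the output column
# becomes the reference answer.
record = {'question': '...', 'choices': 'A. ...', 'label': 'C', 'answer_idx': 'C'}
prompt_inputs = {k: record[k] for k in ['question', 'choices']}  # input_columns
reference = record['label']                                      # output_column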
@@ -61,41 +60,49 @@ MedQA_infer_cfg = dict(
     inferencer=dict(type=GenInferencer),
 )
 
-MedQA_eval_cfg = dict(
-    evaluator=dict(
-        type=GenericLLMEvaluator,
-        prompt_template=dict(
-            type=PromptTemplate,
-            template=dict(
-                begin=[
-                    dict(
-                        role='SYSTEM',
-                        fallback_role='HUMAN',
-                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
-                    )
-                ],
-                round=[
-                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
-                ],
-            ),
-        ),
-        dataset_cfg=dict(
-            type=MedQADataset,
-            path='opencompass/MedQA',
-            reader_cfg=MedQA_reader_cfg,
-        ),
-        judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
-    ),
-)
-
-MedQA_datasets.append(
-    dict(
-        abbr=f'MedQA',
-        type=MedQADataset,
-        path='opencompass/MedQA',
-        reader_cfg=MedQA_reader_cfg,
-        infer_cfg=MedQA_infer_cfg,
-        eval_cfg=MedQA_eval_cfg,
-    )
-)
+MedQA_subsets = {
+    'US': 'xuxuxuxuxu/MedQA_US_test',
+    'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
+    'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
+}
+
+for split in list(MedQA_subsets.keys()):
+
+    MedQA_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(
+                type=MedQADataset,
+                path=MedQA_subsets[split],
+                reader_cfg=MedQA_reader_cfg,
+            ),
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+    )
+
+    MedQA_datasets.append(
+        dict(
+            abbr=f'MedQA_{split}',
+            type=MedQADataset,
+            path=MedQA_subsets[split],
+            reader_cfg=MedQA_reader_cfg,
+            infer_cfg=MedQA_infer_cfg,
+            eval_cfg=MedQA_eval_cfg,
+        )
+    )
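Because MedQA_eval_cfg is rebuilt on every iteration, each appended entry carries a judge whose dataset_cfg points at the same subset it infers on. A minimal sketch of the names the loop yields (structure only; the real dicts also carry reader/infer/eval configs):

MedQA_subsets = {
    'US': 'xuxuxuxuxu/MedQA_US_test',
    'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
    'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
}
entries = [{'abbr': f'MedQA_{split}', 'path': path}
           for split, path in MedQA_subsets.items()]
print([e['abbr'] for e in entries])
# ['MedQA_US', 'MedQA_Mainland', 'MedQA_Taiwan']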
opencompass/datasets/MedQA.py

@@ -1,4 +1,4 @@
-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import Dataset, load_dataset
 
 from opencompass.registry import LOAD_DATASET
 
@@ -11,18 +11,13 @@ class MedQADataset(BaseDataset):
     @staticmethod
     def load_single(path):
         dataset = []
-        data_lines = load_dataset(path, 'test') # "data/MedQA"
-        num = 0
-        for data in data_lines:
-            num += 1
+        ds = load_dataset(path)
+        for data in ds['train']:
+            data['label'] = data['answer_idx']
             choices = ''
-            for i in range(4):
-                data[chr(65 + i)] = data['ending' + str(i)]
-                choices += chr(65 + i) + '. ' + data['ending' + str(i)] + '\n'
-            data['question'] = data['sent1']
+            for option in data['options']:
+                choices += option + '. ' + data['options'][option] + '\n'
             data['choices'] = choices
-            data['label'] = chr(65 + int(data['label'])) + '. ' + data[
-                'ending' + str(data['label'])]
 
             dataset.append(data)
 
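The rewritten loop reads the Hugging Face MedQA schema directly: question, a letter-keyed options dict, and answer_idx. A self-contained walkthrough of the same transform on one invented record (the field values are made up; the logic mirrors the new code above):

data = {
    'question': 'Which vitamin deficiency causes scurvy?',
    'options': {'A': 'Vitamin A', 'B': 'Vitamin B12',
                'C': 'Vitamin C', 'D': 'Vitamin D'},
    'answer_idx': 'C',
}
data['label'] = data['answer_idx']
choices = ''
for option in data['options']:
    choices += option + '. ' + data['options'][option] + '\n'
data['choices'] = choices
print(data['choices'])  # A. Vitamin A ... D. Vitamin D, one option per line
print(data['label'])    # C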
@@ -30,10 +25,5 @@ class MedQADataset(BaseDataset):
 
     @staticmethod
     def load(path):
-        train_dataset = Dataset.from_list([])
-        val_dataset = MedQADataset.load_single(path) # "data/MedQA/test.json"
-        dataset = DatasetDict({
-            'train': train_dataset,
-            'validation': val_dataset
-        })
+        dataset = MedQADataset.load_single(path)
         return dataset
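With the empty-train DatasetDict wrapper gone, load() simply forwards whatever load_single() builds, which is what lets the reader configs drop test_split='validation'. A hypothetical usage, assuming the subset repo named in the configs is reachable on the Hub and that indexing behaves the same whether load_single() returns a list of dicts or a Dataset built from it:

ds = MedQADataset.load('xuxuxuxuxu/MedQA_US_test')
print(ds[0]['question'])
print(ds[0]['label'])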