2025-04-25 14:47:20 +08:00
|
|
|
from datasets import Dataset, DatasetDict, load_dataset
|
2025-04-24 00:04:21 +08:00
|
|
|
|
|
|
|
from opencompass.registry import LOAD_DATASET
|
|
|
|
|
|
|
|
from .base import BaseDataset
|
|
|
|
|
|
|
|
|
|
|
|
@LOAD_DATASET.register_module()
class PubMedQADataset(BaseDataset):
    """Loader that reshapes ``qiaojin/PubMedQA`` into an A/B/C choice task.

    Each record of the ``pqa_labeled`` configuration is turned into a
    prompt made of its contexts and question, a fixed yes/no/maybe option
    list, and a gold label expressed as one of those options.
    """

    @staticmethod
    def load_single():
        """Build the evaluation split from the ``train`` split of the source.

        Every record keeps its original fields and is updated as follows:

        * ``question`` is overwritten with the record's ``context`` followed
          by the original ``question``.
        * ``choices`` is set to the fixed three-option string.
        * ``label`` is derived from ``final_decision``: ``'yes'`` and
          ``'no'`` map to options A and B, and any other value maps to
          option C.

        Returns:
            Dataset: the transformed records, in source order.
        """
        # Only 'yes' and 'no' are mapped explicitly; everything else falls
        # through to the 'maybe' option below.
        option_for_decision = {'yes': 'A. yes', 'no': 'B. no'}
        source = load_dataset('qiaojin/PubMedQA', 'pqa_labeled')

        records = []
        for record in source['train']:
            prompt = (f"CONTEXTS: {record['context']}\n"
                      f"QUESTION: {record['question']}")
            record['question'] = prompt
            record['choices'] = 'A. yes\nB. no\nC. maybe'
            record['label'] = option_for_decision.get(
                record['final_decision'], 'C. maybe')
            records.append(record)

        return Dataset.from_list(records)

    @staticmethod
    def load(path):
        """Return a ``DatasetDict`` with an empty train split.

        Args:
            path: Accepted as part of the signature but not read here; the
                dataset identifier is fixed inside ``load_single``.

        Returns:
            DatasetDict: ``'train'`` is an empty dataset and
            ``'validation'`` holds the output of ``load_single``.
        """
        return DatasetDict({
            'train': Dataset.from_list([]),
            'validation': PubMedQADataset.load_single(),
        })
|