OpenCompass/opencompass/datasets/mmlu_cf.py

42 lines
1.3 KiB
Python
Raw Normal View History

from datasets import DatasetDict, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MMLUCFDataset(BaseDataset):
    """Dataset loader that exposes two splits of a HuggingFace dataset.

    The splits named ``{name}_dev`` and ``{name}_val`` are returned under
    the keys ``'dev'`` and ``'test'`` respectively.
    """

    @staticmethod
    def load(path: str, name: str):
        """Load the ``dev``/``val`` splits for ``name`` from ``path``.

        Args:
            path: Dataset identifier or location, passed unchanged to
                ``datasets.load_dataset``.
            name: Prefix of the split names to read; the splits
                ``f'{name}_dev'`` and ``f'{name}_val'`` must exist.

        Returns:
            A ``DatasetDict`` with keys ``'dev'`` and ``'test'`` (the
            ``val`` split is served as ``'test'``). Each example keeps
            its original columns and gains ``'input'`` (a copy of
            ``'Question'``) and ``'target'`` (a copy of ``'Answer'``).

        Raises:
            KeyError: If one of the two splits is missing, or an example
                lacks one of the ``Question``/``A``/``B``/``C``/``D``/
                ``Answer`` fields.
        """
        hf_dataset = load_dataset(path)

        def _to_eval_fields(example):
            # ``map`` merges the returned dict into each example: existing
            # columns are kept, ``input`` and ``target`` are added.
            return {
                'input': example['Question'],
                'A': example['A'],
                'B': example['B'],
                'C': example['C'],
                'D': example['D'],
                'target': example['Answer'],
            }

        # Only the two splits that are actually returned are processed.
        # The previous identity ``map`` over every split of the dataset
        # dropped no columns (``map`` never removes columns unless
        # ``remove_columns`` is given), so it was pure overhead.
        dev_split = hf_dataset[f'{name}_dev'].map(_to_eval_fields)
        # The 'val' split is used as 'test'.
        test_split = hf_dataset[f'{name}_val'].map(_to_eval_fields)

        return DatasetDict({'dev': dev_split, 'test': test_split})