# OpenCompass/opencompass/datasets/mmlu_cf.py

from datasets import DatasetDict, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class MMLUCFDataset(BaseDataset):
    """Dataset loader that exposes one subject's dev/val splits as dev/test."""

    @staticmethod
    def load(path: str, name: str) -> DatasetDict:
        """Load the ``{name}_dev`` and ``{name}_val`` splits of a dataset.

        Each example of the two splits gains an ``input`` field (copied from
        ``Question``) and a ``target`` field (copied from ``Answer``); all
        original columns are left in place.

        Args:
            path: Identifier or path handed to ``datasets.load_dataset``.
            name: Prefix of the split names to read; the splits used are
                ``f'{name}_dev'`` and ``f'{name}_val'``.

        Returns:
            A ``DatasetDict`` with two entries: ``'dev'`` (from
            ``{name}_dev``) and ``'test'`` (from ``{name}_val``).

        Raises:
            KeyError: If one of the two splits is missing from the loaded
                dataset, or a split lacks one of the required columns.
        """
        hf_dataset = load_dataset(path)

        required_columns = ('Question', 'A', 'B', 'C', 'D', 'Answer')
        # Output split name -> suffix of the source split. The 'val' split
        # is served as 'test'.
        split_suffixes = {'dev': 'dev', 'test': 'val'}

        def add_eval_fields(example):
            # ``map`` merges the returned dict into the example, so only the
            # new fields need to be produced; 'A'-'D' are kept as they are.
            return {
                'input': example['Question'],
                'target': example['Answer'],
            }

        converted = {}
        for out_split, suffix in split_suffixes.items():
            sub_set = f'{name}_{suffix}'
            if sub_set not in hf_dataset:
                raise KeyError(
                    f'Split {sub_set!r} not found in dataset {path!r}; '
                    f'available splits: {sorted(hf_dataset.keys())}')

            subset = hf_dataset[sub_set]
            # Check the schema up front instead of rewriting every split of
            # the dataset just to touch these columns.
            missing = [
                column for column in required_columns
                if column not in subset.column_names
            ]
            if missing:
                raise KeyError(
                    f'Split {sub_set!r} of dataset {path!r} is missing '
                    f'required columns: {missing}')

            converted[out_split] = subset.map(add_eval_fields)

        return DatasetDict(converted)