OpenCompass/opencompass/datasets/mmmlu.py
Bob Tsang dd0b655bd0
[Feature] Support MMMLU & MMMLU-lite Benchmark (#1565)
* rm folder

* modify format according to reviewer

* modify format according to reviewer

* modify format according to reviewer

* add some files requirement

* fix some bug

* fix bug

* change load type

* Update MMMLU Dataset

* Update MMMLU Dataset

* Add MMMLU-Lite Dataset

* update MMMLU dataset

* update MMMLU dataset

* update MMMLU dataset

---------

Co-authored-by: BobTsang <BobTsang1995@gmail.com>
Co-authored-by: liushz <qq1791167085@163.com>
2024-10-17 19:09:34 +08:00

53 lines
1.5 KiB
Python

# flake8: noqa
# yapf: disable
import json
import os
from datasets import Dataset, DatasetDict, load_dataset
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MMMLUDataset(BaseDataset):
    """Loader that fetches one MMMLU subset through ``load_dataset``."""

    @staticmethod
    def load(path: str, name: str):
        """Build a ``DatasetDict`` containing the ``test`` split of a subset.

        Args:
            path: Dataset location, forwarded unchanged to ``load_dataset``.
            name: Config name; its second ``_``-separated field is taken as
                the subset, with every ``-`` replaced by ``_``.

        Returns:
            A ``DatasetDict`` whose ``test`` entry has the columns ``input``,
            ``A``, ``B``, ``C``, ``D``, ``target`` and ``subject``.

        Raises:
            IndexError: If ``name`` contains no ``_``.
        """
        subset_name = name.split('_')[1].replace('-', '_')
        option_keys = ('A', 'B', 'C', 'D')

        result = DatasetDict()
        for split_name in ('test',):
            # NOTE(review): trust_remote_code=True is passed through to
            # ``load_dataset``; confirm the dataset source is trusted.
            raw_split = load_dataset(
                path=path,
                name=subset_name,
                split=split_name,
                trust_remote_code=True,
            )
            rows = [
                {
                    'input': record['Question'],
                    **{key: record[key] for key in option_keys},
                    'target': record['Answer'],
                    # Subjects are shown with spaces instead of underscores.
                    'subject': record['Subject'].replace('_', ' '),
                }
                for record in raw_split
            ]
            result[split_name] = Dataset.from_list(rows)
        return result
@LOAD_DATASET.register_module()
class MMMLULiteDataset(BaseDataset):
    """Loader that reads one MMMLU-lite subset from a local JSON Lines file."""

    @staticmethod
    def load(path: str, name: str):
        """Read ``<path>/<name>.jsonl`` into the ``test`` split.

        Args:
            path: Directory that contains the ``.jsonl`` files.
            name: Subset name, i.e. the file name without the ``.jsonl``
                suffix.

        Returns:
            A ``DatasetDict`` with a single ``test`` split holding one row
            per non-blank line of the file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        jsonl_path = os.path.join(path, name + '.jsonl')
        # JSON text is UTF-8 by specification; decoding explicitly keeps
        # non-ASCII content intact regardless of the platform's default
        # encoding.
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            # Skip blank lines (e.g. a trailing newline at end of file) so
            # they do not abort the whole load with a JSONDecodeError.
            records = [json.loads(line) for line in f if line.strip()]

        dataset = DatasetDict()
        dataset['test'] = Dataset.from_list(records)
        return dataset