mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

* support dataset repeat and g-pass compute for each evaluator * fix pre-commit errors * delete print * delete gpassk_evaluator and fix potential errors * change `repeat` to `n` * fix `repeat` to `n` in openicl_eval * update doc for multi-run and g-pass * update latex equation in doc * update eng doc for multi-run and g-pass * update datasets.md * update datasets.md * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation in zh_cn user_guides * modify pre-commit-zh-cn * recover pre-commit and edit math expr in doc * del [TIP] * del cite tag in doc * del extract_model param in livemathbench config
60 lines
2.0 KiB
Python
60 lines
2.0 KiB
Python
from copy import deepcopy
|
|
from typing import Dict, List, Optional, Union
|
|
|
|
from datasets import Dataset, DatasetDict
|
|
|
|
from opencompass.openicl import DatasetReader
|
|
|
|
|
|
class BaseDataset:
    """Load a dataset, tag and repeat its rows, and wrap it in a reader.

    The constructor calls :meth:`load` with the remaining keyword arguments,
    adds ``subdivision`` and ``idx`` fields to every row that lacks them,
    repeats every split ``n`` times and finally hands the result to
    ``DatasetReader`` through :meth:`_init_reader`.

    Args:
        reader_cfg: Keyword arguments forwarded to :meth:`_init_reader`.
            ``None`` is treated as an empty configuration.
        k: A single value or a list of values; its maximum must not exceed
            ``n``. It is only validated here, not stored.
        n: Number of times every row of every split is repeated.
        **kwargs: ``abbr`` (default ``'dataset'``) is popped and used to
            build the ``subdivision`` tag; everything else is passed to
            :meth:`load`.

    Raises:
        AssertionError: If the (maximum) value of ``k`` is greater than
            ``n``.
    """

    def __init__(self,
                 reader_cfg: Optional[Dict] = None,
                 k: Union[int, List[int]] = 1,
                 n: int = 1,
                 **kwargs):
        abbr = kwargs.pop('abbr', 'dataset')
        dataset = self.load(**kwargs)
        # maybe duplicate
        assert (max(k) if isinstance(k, list) else
                k) <= n, 'Maximum value of `k` must less than or equal to `n`'
        if isinstance(dataset, Dataset):
            self.dataset = Dataset.from_list(
                self._tag_and_repeat(dataset, abbr, n))
        else:
            # Anything that is not a single `Dataset` is handled as a
            # mapping from split name to rows.
            self.dataset = DatasetDict()
            for key in dataset:
                self.dataset[key] = Dataset.from_list(
                    self._tag_and_repeat(dataset[key], f'{abbr}_{key}', n))
        # `reader_cfg` is annotated as Optional, so accept None as "no
        # extra reader options" instead of failing on `**None`.
        self._init_reader(**(reader_cfg or {}))

    @staticmethod
    def _tag_and_repeat(rows, subdivision: str, n: int) -> List[Dict]:
        """Fill in default ``subdivision``/``idx`` fields and repeat rows.

        Args:
            rows: Iterable of dict-like examples.
            subdivision: Value stored under ``'subdivision'`` for rows that
                do not already carry that key.
            n: Number of repetitions of the whole row sequence.

        Returns:
            A flat list holding ``n`` consecutive, independent deep copies
            of the tagged rows (all rows once, then all rows again, ...).
        """
        tagged = []
        for idx, example in enumerate(rows):
            # Existing values win; only missing keys are filled in.
            example.setdefault('subdivision', subdivision)
            example.setdefault('idx', idx)
            tagged.append(example)
        # Build the repeated list in one linear pass rather than with
        # `sum(list_of_lists, [])`, which re-copies the accumulator on
        # every addition.
        return [deepcopy(example) for _ in range(n) for example in tagged]

    def _init_reader(self, **kwargs):
        """Create ``self.reader`` from ``self.dataset`` and reader options."""
        self.reader = DatasetReader(self.dataset, **kwargs)

    @property
    def train(self):
        """The ``'train'`` entry of the reader's dataset."""
        return self.reader.dataset['train']

    @property
    def test(self):
        """The ``'test'`` entry of the reader's dataset."""
        return self.reader.dataset['test']

    @staticmethod
    def load(**kwargs) -> Union[Dataset, DatasetDict]:
        """Return the raw dataset; this base implementation returns None.

        NOTE(review): presumably meant to be overridden by subclasses --
        the constructor cannot process the ``None`` returned here.
        """
        pass