OpenCompass/opencompass/datasets/base.py
Junnan Liu 73c80953c6
[Feature] Support Dataset Repeat and G-Pass Compute for Each Evaluator (#1886)
* support dataset repeat and g-pass compute for each evaluator

* fix pre-commit errors

* delete print

* delete gpassk_evaluator and fix potential errors

* change `repeat` to `n`

* fix `repeat` to `n` in openicl_eval

* update doc for multi-run and g-pass

* update latex equation in doc

* update eng doc for multi-run and g-pass

* update datasets.md

* update datasets.md

* fix multi-line equation

* fix multi-line equation

* fix multi-line equation

* fix multi-line equation

* fix multi-line equation

* fix multi-line equation

* fix multi-line equation in zh_cn user_guides

* modify pre-commit-zh-cn

* recover pre-commit and edit math expr in doc

* del [TIP]

* del cite tag in doc

* del extract_model param in livemathbench config
2025-02-26 19:43:12 +08:00

60 lines
2.0 KiB
Python

from copy import deepcopy
from typing import Dict, List, Optional, Union
from datasets import Dataset, DatasetDict
from opencompass.openicl import DatasetReader
class BaseDataset:
    """Base class that loads a dataset and wraps it in a ``DatasetReader``.

    Subclasses provide the data by overriding :meth:`load`. During
    construction every example receives a ``subdivision`` and an ``idx``
    field (existing values are kept), the example list is repeated ``n``
    times, and the result is handed to ``DatasetReader``.

    Args:
        reader_cfg (dict, optional): Keyword arguments forwarded to
            ``DatasetReader``. ``None`` is treated as an empty dict.
        k (int | list[int]): Only validated here: its (maximum) value must
            not exceed ``n``.
        n (int): Number of times the loaded examples are repeated.
        **kwargs: Forwarded to :meth:`load`. The optional ``abbr`` entry
            (default ``'dataset'``) is removed first and used to fill the
            ``subdivision`` field.

    Raises:
        AssertionError: If the maximum value of ``k`` is greater than ``n``.
    """

    def __init__(self,
                 reader_cfg: Optional[Dict] = None,
                 k: Union[int, List[int]] = 1,
                 n: int = 1,
                 **kwargs):
        abbr = kwargs.pop('abbr', 'dataset')

        # Validate before loading so a bad `k`/`n` pair fails fast. The
        # error is raised explicitly (instead of via `assert`) so the check
        # is not stripped under `python -O`; the exception type and message
        # are kept for backward compatibility.
        max_k = max(k) if isinstance(k, (list, tuple)) else k
        if max_k > n:
            raise AssertionError(
                'Maximum value of `k` must less than or equal to `n`')

        dataset = self.load(**kwargs)
        if isinstance(dataset, Dataset):
            self.dataset = Dataset.from_list(
                self._expand_examples(dataset, abbr, n))
        else:
            # Anything that is not a single `Dataset` is treated as a
            # mapping from split name to dataset.
            self.dataset = DatasetDict()
            for key in dataset:
                self.dataset[key] = Dataset.from_list(
                    self._expand_examples(dataset[key], f'{abbr}_{key}', n))

        # `reader_cfg` may be None; never unpack a shared mutable default.
        self._init_reader(**(reader_cfg or {}))

    @staticmethod
    def _expand_examples(examples, subdivision: str, n: int) -> List[Dict]:
        """Tag each example and repeat the whole sequence ``n`` times.

        Args:
            examples: Iterable of dict-like examples.
            subdivision (str): Value stored under ``'subdivision'`` for
                examples that do not already have that key.
            n (int): Number of repetitions.

        Returns:
            list[dict]: ``n`` back-to-back deep copies of the tagged
            examples, i.e. ``[e0, e1, ..., e0, e1, ...]``. The input
            examples themselves are not modified.
        """
        tagged = []
        for idx, example in enumerate(examples):
            row = dict(example)
            # Keep values that the dataset already provides.
            row.setdefault('subdivision', subdivision)
            row.setdefault('idx', idx)
            tagged.append(row)
        # A flat comprehension avoids the quadratic list concatenation of
        # `sum(list_of_lists, [])` while still yielding independent copies.
        return [deepcopy(row) for _ in range(n) for row in tagged]

    def _init_reader(self, **kwargs):
        """Build ``self.reader`` from ``self.dataset`` and ``kwargs``."""
        self.reader = DatasetReader(self.dataset, **kwargs)

    @property
    def train(self):
        """The ``'train'`` entry of the reader's dataset."""
        return self.reader.dataset['train']

    @property
    def test(self):
        """The ``'test'`` entry of the reader's dataset."""
        return self.reader.dataset['test']

    @staticmethod
    def load(**kwargs) -> 'Union[Dataset, DatasetDict]':
        """Load the raw dataset; meant to be overridden by subclasses.

        The base implementation does nothing and returns ``None``.
        """
        pass