OpenCompass/opencompass/datasets/SciEval.py
tcheng 3d1760aba2
[Dataset] Add Scieval (#2089)
* style: pass all formatting hooks (yapf & quote fixer)

* revise name: Add Lifescience Sub-set Support for MMLU & SciEval (datasets + configs + loader)

* revise name: Add Lifescience SciEval (datasets + configs + loader+dataset-index.yml)

* Add Lifescience SciEval (datasets + configs + loader+dataset-index.yml)

* all categories of SciEval (datasets + configs + loader+dataset-index.yml)

* revise name: Add Lifescience SciEval (datasets + configs + loader+dataset-index.yml)

* revise: SciEval 5-shot

---------

Co-authored-by: root <tangcheng231@mails.ucas.edu.cn>
2025-05-14 10:25:03 +08:00

69 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from typing import List
from datasets import Dataset, DatasetDict, load_dataset
from opencompass.datasets.base import BaseDataset
from opencompass.registry import LOAD_DATASET
# Regex source for a four-option multiple-choice question. It is kept as a
# plain pattern string (not a compiled object); the loader passes it to
# ``re.search`` together with ``re.S`` so that ``.`` also spans newlines.
# The fragments are split so that each source line stays under 79 characters.
_PATTERN_MC = ''.join((
    r'^(?P<stem>.*?)',  # question stem
    r'(?:A\.)\s*(?P<A>.*?)\s*',  # option A
    r'B\.\s*(?P<B>.*?)\s*',  # option B
    r'C\.\s*(?P<C>.*?)\s*',  # option C
    r'D\.\s*(?P<D>.*?)',  # option D
    r'Answer:',  # answer delimiter
))
@LOAD_DATASET.register_module()
class SciEvalDataset(BaseDataset):
    """Loader for the multiple-choice subset of SciEval.

    All categories are kept unless a ``category`` keyword argument is
    supplied, in which case only examples of that category are retained.
    """

    @staticmethod
    def load(path: str, name: str, **kwargs) -> DatasetDict:
        """Build a ``DatasetDict`` of parsed multiple-choice examples.

        Args:
            path: Dataset path forwarded to ``load_dataset``.
            name: Configuration name forwarded to ``load_dataset``.
            **kwargs: Only ``category`` is read. When it is not ``None``,
                examples whose ``category`` field differs are dropped.
                Any other keyword is ignored.

        Returns:
            A ``DatasetDict`` holding a single ``test`` split. Each row has
            the keys ``input`` (question stem), ``A``, ``B``, ``C``, ``D``
            (option texts) and ``target`` (first entry of the answer field).
            Examples that are not multiple-choice, have no answer, or whose
            question text does not match ``_PATTERN_MC`` are skipped.
        """
        # When a category is given keep only that one; otherwise keep all.
        category = kwargs.get('category')
        dataset: DatasetDict = DatasetDict()

        for split in ('test', ):
            raw_iter = load_dataset(
                path,
                name=name,
                split=split,
                streaming=True,
            )
            examples: List[dict] = []

            for ex in raw_iter:
                # Keep multiple-choice questions only.
                if ex.get('type') != 'multiple-choice':
                    continue
                # Apply the optional category filter.
                if category is not None \
                        and ex.get('category') != category:
                    continue

                # Prefer 'answer', fall back to 'answers'; skip if neither
                # carries a value.
                ans_list = (ex.get('answer') or ex.get('answers') or [])
                if not ans_list:
                    continue
                target = ans_list[0]

                # ``dict.get`` with a default only covers a missing key; a
                # key that is present with a null value would hand ``None``
                # to ``re.search`` and raise TypeError. Normalise it to an
                # empty string so the record is skipped as unparsable.
                question = ex.get('question') or ''
                match = re.search(_PATTERN_MC, question, re.S)
                if not match:
                    continue

                examples.append({
                    'input': match.group('stem').strip(),
                    'A': match.group('A').strip(),
                    'B': match.group('B').strip(),
                    'C': match.group('C').strip(),
                    'D': match.group('D').strip(),
                    'target': target,
                })

            dataset[split] = Dataset.from_list(examples)

        return dataset