mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
81 lines
2.8 KiB
Python
81 lines
2.8 KiB
Python
![]() |
import json
|
||
|
|
||
|
from datasets import Dataset
|
||
|
|
||
|
from opencompass.registry import LOAD_DATASET
|
||
|
|
||
|
from .base import BaseDataset
|
||
|
|
||
|
|
||
|
@LOAD_DATASET.register_module()
class C3Dataset(BaseDataset):
    """Loader that flattens a C3-style JSON file into one row per question.

    Every question is emitted with exactly four ``choiceN`` columns and an
    integer ``label`` giving the position of the answer in the choice list.
    """

    @staticmethod
    def load(path: str):
        """Read the JSON file at ``path`` and build a ``Dataset``.

        The file is expected to decode to a sequence of entries where
        ``entry[0]`` is an iterable of paragraphs and ``entry[1]`` is an
        iterable of dicts with the keys ``'question'``, ``'choice'`` and
        ``'answer'``.

        Args:
            path: Location of the JSON file. It is read as UTF-8.

        Returns:
            A ``Dataset`` with the columns ``content``, ``question``,
            ``choice0`` .. ``choice3``, ``choices`` and ``label``.

        Raises:
            ValueError: If a question's answer is not among its choices
                (raised by ``list.index``).
        """
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default locale encoding.
        with open(path, encoding='utf-8') as f:
            data = json.load(f)

        rows = []
        for entry in data:
            # Each paragraph is collapsed with ''.join, then the paragraphs
            # are separated by a single space.
            content_str = ' '.join(
                [''.join(paragraph) for paragraph in entry[0]])

            for question in entry[1]:
                # Pad a copy so the parsed JSON structure is left untouched.
                choices = list(question['choice'])
                label = choices.index(question['answer'])

                missing = 4 - len(choices)
                if missing > 0:
                    # Use the first choice as the filler value so that every
                    # question exposes at least four options.
                    choices += [choices[0]] * missing

                rows.append({
                    'content': content_str,
                    'question': question['question'],
                    'choices': choices,
                    'choice0': choices[0],
                    'choice1': choices[1],
                    'choice2': choices[2],
                    'choice3': choices[3],
                    'label': label,
                })

        # Column order is kept explicit so the resulting schema is stable.
        column_names = ('content', 'question', 'choice0', 'choice1',
                        'choice2', 'choice3', 'choices', 'label')
        dataset = Dataset.from_dict({
            name: [row[name] for row in rows]
            for name in column_names
        })
        return dataset
|
||
|
|
||
|
|
||
|
@LOAD_DATASET.register_module()
class C3Dataset_V2(BaseDataset):
    """Loader that flattens a C3-style JSON file with letter labels.

    Every question is emitted with exactly four ``choiceN`` columns; missing
    options are filled with the placeholder ``'[NULL]'`` and the answer is
    encoded as one of the letters ``'A'`` .. ``'D'``.
    """

    @staticmethod
    def load(path: str):
        """Read the JSON file at ``path`` and build a ``Dataset``.

        The file is expected to decode to a sequence of entries where
        ``entry[0]`` is an iterable of paragraphs and ``entry[1]`` is an
        iterable of dicts with the keys ``'question'``, ``'choice'`` and
        ``'answer'``.

        Args:
            path: Location of the JSON file. It is read as UTF-8.

        Returns:
            A ``Dataset`` with the columns ``content``, ``question``,
            ``choice0`` .. ``choice3`` and ``label``.

        Raises:
            ValueError: If a question's answer is not among its choices
                (raised by ``list.index``).
            IndexError: If the answer sits at position 4 or later, since
                only the letters ``'ABCD'`` are available.
        """
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default locale encoding.
        with open(path, encoding='utf-8') as f:
            raw = json.load(f)

        data = []
        for line in raw:
            # Paragraphs are concatenated directly, with no separator.
            content = ''.join([''.join(paragraph) for paragraph in line[0]])

            for question in line[1]:
                # Pad a copy so the parsed JSON structure is left untouched.
                choices = list(question['choice'])
                label = 'ABCD'[choices.index(question['answer'])]

                # A non-positive repeat count yields an empty list, so
                # questions that already have four or more choices are
                # left as they are.
                choices += ['[NULL]'] * (4 - len(choices))

                data.append({
                    'content': content,
                    'question': question['question'],
                    'choice0': choices[0],
                    'choice1': choices[1],
                    'choice2': choices[2],
                    'choice3': choices[3],
                    'label': label,
                })
        return Dataset.from_list(data)
|