
# flake8: noqa: E501
# Dataset loaders for OpenCompass subjective evaluation: SubInferDataset loads
# the raw question set for the inference stage, and SubJudgeDataset pairs the
# predictions of two models for pairwise judge-based comparison.
import json
import random

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class SubInferDataset(BaseDataset):

    @staticmethod
    def load(path: str):
        dataset = DatasetDict()
        raw_data = []
        with open(path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            for problem in json_data:
                question = problem['question']
                reference_answer = problem['reference_answer']
                evaluating_guidance = problem['evaluating_guidance']
                capability = problem['capability']
                raw_data.append({
                    'question': question,
                    'judge': {
                        'question': question,
                        'reference_answer': reference_answer,
                        'evaluating_guidance': evaluating_guidance,
                        'capability': capability
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset

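# Illustrative sketch (not part of the original module): the JSON layout that
# SubInferDataset.load expects, inferred from the field accesses above. The
# file is assumed to be a list of records with exactly these keys; the values
# shown here are placeholders.
#
# [
#     {
#         "question": "...",
#         "reference_answer": "...",
#         "evaluating_guidance": "...",
#         "capability": "..."
#     }
# ]
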
@LOAD_DATASET.register_module()
class SubJudgeDataset(BaseDataset):

    @staticmethod
    def load(
        path: str,
        model1: str,
        path2: str,
        model2: str,
        mode='compare',
        random_order=True,
        random_seed=0,
    ):
        dataset = DatasetDict()
        raw_data = []
        if mode == 'compare':
            with open(path, 'r', encoding='utf-8') as f:
                json_data1 = json.load(f)
            with open(path2, 'r', encoding='utf-8') as f:
                json_data2 = json.load(f)
            random_generate = random.Random(random_seed)
            same_flag = 0
            for idx in json_data1:
                problem = json_data1[idx]
                answer1 = json_data1[idx]['prediction']
                answer2 = json_data2[idx]['prediction']
                # Identical predictions need no judgement; count and skip them.
                if answer1 == answer2:
                    same_flag += 1
                    continue
                item = {}
                item['question'] = problem['gold']['question']
                item['reference_answer'] = problem['gold']['reference_answer']
                item['evaluating_guidance'] = problem['gold'][
                    'evaluating_guidance']
                item['capability'] = problem['gold']['capability']
                if random_order:
                    # Randomly swap which model appears as answer1/answer2 to
                    # reduce position bias in the judge.
                    if random_generate.randint(0, 1) == 0:
                        item['answer1'] = answer1
                        item['model1'] = model1
                        item['answer2'] = answer2
                        item['model2'] = model2
                    else:
                        item['answer1'] = answer2
                        item['model1'] = model2
                        item['answer2'] = answer1
                        item['model2'] = model1
                else:
                    item['answer1'] = answer1
                    item['model1'] = model1
                    item['answer2'] = answer2
                    item['model2'] = model2
                raw_data.append({
                    'question': item['question'],
                    'reference_answer': item['reference_answer'],
                    'evaluating_guidance': item['evaluating_guidance'],
                    'capability': item['capability'],
                    'answer1': item['answer1'],
                    'answer2': item['answer2'],
                    'judge': {
                        'capability': item['capability'],
                        'model1': item['model1'],
                        'model2': item['model2']
                    }
                })
            if same_flag != 0:
                print(f'Among {len(json_data1)} comparisons, {same_flag} '
                      'cases are exact matches and will be skipped.')
        elif mode == 'score':
            # Score mode is not implemented yet.
            pass
        dataset = Dataset.from_list(raw_data)
        return dataset

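# Illustrative sketch (not part of the original module): how the judge-stage
# loader might be called in 'compare' mode. Both prediction files are assumed
# to be dicts keyed by example index, each value holding a 'prediction' string
# and a 'gold' record with the fields read above; the paths and model names
# below are hypothetical.
#
# dataset = SubJudgeDataset.load(
#     path='outputs/model_a_predictions.json',
#     model1='model_a',
#     path2='outputs/model_b_predictions.json',
#     model2='model_b',
#     mode='compare',
#     random_order=True,
#     random_seed=0,
# )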