OpenCompass/opencompass/datasets/subject.py
bittersweet1999 1c95790fdd
New subjective judgement (#660)
* TabMWP

* TabMWP

* fixed

* fixed

* fixed

* done

* done

* done

* add new subjective judgement

* add new subjective judgement

* add new subjective judgement

* add new subjective judgement

* add new subjective judgement

* modified to a more general way

* modified to a more general way

* final

* final

* add summarizer

* add new summarize

* fixed

* fixed

* fixed

---------

Co-authored-by: caomaosong <caomaosong@pjlab.org.cn>
2023-12-06 13:28:33 +08:00

117 lines
4.0 KiB
Python

# flake8: noqa: E501
import json
import random
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class SubInferDataset(BaseDataset):
    """Loader for the question set used to collect model predictions."""

    @staticmethod
    def load(path: str):
        """Read a JSON file of problems and build a ``Dataset`` from them.

        Args:
            path: Path to a UTF-8 encoded JSON file. The decoded value is
                iterated, and every element must provide the keys
                ``question``, ``reference_answer``, ``evaluating_guidance``
                and ``capability``.

        Returns:
            The result of ``Dataset.from_list`` over one row per problem.
            Each row carries ``question`` plus a ``judge`` dict holding the
            question, reference answer, evaluating guidance and capability.

        Raises:
            KeyError: If a problem is missing one of the required keys.
        """
        with open(path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        raw_data = []
        for problem in json_data:
            question = problem['question']
            raw_data.append({
                'question': question,
                # Everything needed later for judging travels with the row.
                'judge': {
                    'question': question,
                    'reference_answer': problem['reference_answer'],
                    'evaluating_guidance': problem['evaluating_guidance'],
                    'capability': problem['capability'],
                },
            })
        return Dataset.from_list(raw_data)
@LOAD_DATASET.register_module()
class SubJudgeDataset(BaseDataset):
    """Loader that pairs two models' predictions for judging."""

    @staticmethod
    def load(
        path: str,
        model1: str,
        path2: str,
        model2: str,
        mode='compare',
        random_order=True,
        random_seed=0,
    ):
        """Build a ``Dataset`` of answer pairs from two prediction files.

        Args:
            path: JSON file with the first model's results. It is expected to
                decode to a mapping whose values contain ``prediction`` and
                a ``gold`` dict (``question``, ``reference_answer``,
                ``evaluating_guidance``, ``capability``).
            model1: Name recorded for the model behind ``path``.
            path2: JSON file with the second model's results; it must contain
                every key of the first file, each with a ``prediction``.
            model2: Name recorded for the model behind ``path2``.
            mode: ``'compare'`` builds pairwise rows. ``'score'`` is accepted
                but not implemented here and produces an empty dataset.
            random_order: When true, the two answers of each pair are swapped
                with probability 1/2 so that position does not reveal the
                model.
            random_seed: Seed of the private ``random.Random`` instance used
                for the swap decision, making the order reproducible.

        Returns:
            The result of ``Dataset.from_list`` over the generated rows.
            Pairs whose two predictions are identical are skipped.

        Raises:
            ValueError: If ``mode`` is neither ``'compare'`` nor ``'score'``.
            KeyError: If an entry lacks a required key, or ``path2`` lacks a
                key present in ``path``.
        """
        if mode not in ('compare', 'score'):
            # An unknown mode used to fall through and silently yield an
            # empty dataset; fail loudly instead.
            raise ValueError(
                f"Unsupported mode {mode!r}; expected 'compare' or 'score'.")

        raw_data = []
        if mode == 'compare':
            with open(path, 'r', encoding='utf-8') as f:
                json_data1 = json.load(f)
            with open(path2, 'r', encoding='utf-8') as f:
                json_data2 = json.load(f)

            random_generate = random.Random(random_seed)
            same_flag = 0
            for idx, problem in json_data1.items():
                answer1 = problem['prediction']
                answer2 = json_data2[idx]['prediction']
                if answer1 == answer2:
                    # Identical outputs cannot be ranked against each other.
                    same_flag += 1
                    continue

                gold = problem['gold']
                first = (answer1, model1)
                second = (answer2, model2)
                # The generator is consulted only when shuffling is enabled
                # and only for pairs that are kept, so the sequence of draws
                # stays tied to the seed.
                if random_order and random_generate.randint(0, 1) == 1:
                    first, second = second, first

                raw_data.append({
                    'question': gold['question'],
                    'reference_answer': gold['reference_answer'],
                    'evaluating_guidance': gold['evaluating_guidance'],
                    'capability': gold['capability'],
                    'answer1': first[0],
                    'answer2': second[0],
                    'judge': {
                        'capability': gold['capability'],
                        'model1': first[1],
                        'model2': second[1],
                    },
                })

            if same_flag != 0:
                print(
                    f'Among {len(json_data1)} comparisons, {same_flag} cases are exact match, which will be skipped. '
                )
        # mode == 'score' has no implementation here: raw_data stays empty.
        return Dataset.from_list(raw_data)