# File: OpenCompass/opencompass/datasets/subjective/compassbench.py
# Repository web-view metadata: 103 lines, 2.7 KiB, Python.
# Last change shown in history view: 2024-05-30 00:21:58 +08:00
# flake8: noqa
import json
import os.path as osp
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
# Chinese judge prompt for pairwise comparison of two model answers.
# Placeholders: {question}, {prediction} (answer 1) and {prediction2}
# (answer 2). The judge is asked to finish with a verdict tag [[A]], [[B]]
# or [[C]], mirroring the structure of the English prompt.
# NOTE(review): the separators (",", "、", ":") in the instruction lines were
# missing and a history-view timestamp had leaked into the text; they are
# restored here to match the English prompt — confirm against upstream.
base_prompt_zh = """请根据 用户问题 以及 相应的两个回答,判断哪一个回答更好。
[用户问题]
{question}
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
请先对两个回答进行评价,最后在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
如果你认为回答1更好,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[A]]
如果你认为回答2更好,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[B]]
如果你认为回答1、2打成平手,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[C]]
"""
# English judge prompt for pairwise comparison of two model responses.
# Placeholders: {question}, {prediction} (Response 1) and {prediction2}
# (Response 2). The judge is asked to write one evaluation per response and
# then finish with a verdict tag: [[A]], [[B]] or [[C]] (tie).
base_prompt_en = """Please evaluate the two responses based on the user's question and then choose from the following three options:
A. Response 1 is better
B. Response 2 is better
C. Both responses are equal
[user's question]
{question}
[Response 1 Start]
{prediction}
[Response 1 End]
[Response 2 Start]
{prediction2}
[Response 2 End]
If you believe that Response 1 is better, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[A]]
If you believe that Response 2 is better, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[B]]
If you believe that both responses are equally good, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[C]]
"""
@LOAD_DATASET.register_module()
class CompassBenchDataset(BaseDataset):
    """Dataset of pairwise-judging prompts built from a JSON problem file."""

    def load(self, path: str, name: str):
        """Read ``<path>/<name>.json`` and build one judge row per problem.

        Each entry of the JSON array must provide ``question``, ``language``,
        ``category`` and ``others`` (a mapping that contains ``level``).

        Args:
            path: Directory that contains the JSON file.
            name: File name without the ``.json`` extension.

        Returns:
            A ``Dataset`` whose rows hold ``question``, ``judge_prompt`` (the
            language-specific template with ``{question}`` filled in) and
            ``judge`` (metadata: ``lan``, ``level``, ``category``,
            ``question``).

        Raises:
            KeyError: If a problem entry lacks one of the required fields.
        """
        filename = osp.join(path, f'{name}.json')
        # Only parsing needs the open handle; release it before building rows.
        with open(filename, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        raw_data = []
        for problem in json_data:
            question = problem['question']
            lan = problem['language']
            others = problem['others']
            # Anything other than 'zh' falls back to the English template.
            judge_prompt = base_prompt_zh if lan == 'zh' else base_prompt_en
            # str.replace (not str.format) fills only {question}; the
            # {prediction} and {prediction2} placeholders stay in the text.
            judge_prompt = judge_prompt.replace('{question}', question)
            raw_data.append({
                'question': question,
                'judge_prompt': judge_prompt,
                'judge': {
                    'lan': lan,
                    'level': others['level'],
                    'category': problem['category'],
                    'question': question,
                },
            })
        return Dataset.from_list(raw_data)