mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
89 lines
2.5 KiB
Python
89 lines
2.5 KiB
Python
# flake8: noqa: E501
|
||
import json
|
||
import os.path as osp
|
||
import re
|
||
from typing import Optional
|
||
|
||
from datasets import Dataset, DatasetDict
|
||
|
||
from opencompass.registry import LOAD_DATASET
|
||
|
||
from .subjective_cmp import SubjectiveCmpDataset
|
||
|
||
# English judge-prompt template. It is filled with ``str.format`` using the
# ``{question}`` and ``{ref}`` placeholders (see ``prompt_construct``), so any
# literal brace added here would have to be doubled. The template ends right
# after the "[Model Response Start]" marker; the closing marker is supplied
# separately as the suffix returned by ``prompt_construct``.
eng_base_prefix = """
You are a judger. Please impartially judge whether an AI model's response to a question is correct based on the reference answers. You need to provide a conclusion of "correct" or "wrong," followed by the corresponding reasoning.

Note that since the reference answer is a candidate list, the AI model's response only needs to align with one item in the list to be deemed "correct."

Your judgment must strictly adhere to the following format:
Conclusion: [[Correct]]
Reasoning: xxx.

Conclusion: [[Wrong]]
Reasoning: xxx.

[Question Start]
{question}
[Question End]

[Reference Answers Start]
{ref}
[Reference Answers End]

[Model Response Start]
"""
|
||
|
||
# Chinese judge-prompt template. It is filled with ``str.format`` using the
# ``{question}`` and ``{ref}`` placeholders (see ``prompt_construct``), so any
# literal brace added here would have to be doubled. The template ends right
# after the model-response start marker; the closing marker is supplied
# separately as the suffix returned by ``prompt_construct``.
chn_base_prefix = """
你是一个评判者，请你基于参考答案，公正地评判一个AI模型对于问题的回答是否正确。你需要给出“对或错”的结论，然后再给出相应的理由。
请注意，由于参考答案是一个候选列表，因此AI模型的回答只要符合列表中的某一项即可判断为“对”。
你的评判必须严格遵守以下格式：
结论：[[对]]
理由：xxx。

结论：[[错]]
理由：xxx。

[问题开始]
{question}
[问题结束]

[参考答案开始]
{ref}
[参考答案结束]

[模型回答开始]
"""
|
||
|
||
|
||
def prompt_construct(sample):
    """Build the judge-prompt prefix and suffix for one sample.

    Args:
        sample: Mapping with a ``'question'`` entry and an ``'others'``
            mapping providing ``'lan'`` (``'zh'`` or ``'en'``) and
            ``'answers'`` (the reference answers, rendered with ``str()``).

    Returns:
        A ``(prefix, suffix)`` tuple. ``prefix`` is the language-specific
        template with the question and reference answers filled in;
        ``suffix`` is the matching model-response end marker.

    Raises:
        ValueError: If ``sample['others']['lan']`` is neither ``'zh'`` nor
            ``'en'``.
    """
    lan = sample['others']['lan']
    question = sample['question']
    # Fail loudly on an unknown language instead of reaching ``return`` with
    # ``prefix``/``suffix`` unbound.
    if lan not in ('zh', 'en'):
        raise ValueError(
            f"Unsupported language {lan!r}; expected 'zh' or 'en'.")
    ref = str(sample['others']['answers'])
    if lan == 'zh':
        prefix = chn_base_prefix.format(question=question, ref=ref)
        suffix = '\n[模型回答结束]\n'
    else:
        prefix = eng_base_prefix.format(question=question, ref=ref)
        suffix = '\n[Model Response End]\n'
    return prefix, suffix
|
||
|
||
|
||
@LOAD_DATASET.register_module()
class IRDataset(SubjectiveCmpDataset):
    """Subjective-comparison dataset whose samples carry judge-prompt fields.

    Registered with ``LOAD_DATASET`` through ``register_module()``.
    """

    def load(
        self,
        path: str,
        name: str,
    ):
        """Load the samples and attach judge-prompt fields to each of them.

        The samples come from the parent class's ``load(path, name)``. Every
        sample is updated in place with:

        * ``'gpt4_prefix'`` / ``'gpt4_suffix'`` -- the pair returned by
          ``prompt_construct`` for that sample;
        * ``sample['judge']['others']`` -- set to ``sample['others']``;
        * ``'ref'`` -- ``str()`` of ``sample['others']['answers']``.

        Args:
            path: Forwarded unchanged to the parent ``load``.
            name: Forwarded unchanged to the parent ``load``.

        Returns:
            A ``Dataset`` built from the updated samples via
            ``Dataset.from_list``.
        """
        # Materialise the parent's result before touching any sample.
        loaded = list(super().load(path, name))
        enriched = []
        for sample in loaded:
            prefix, suffix = prompt_construct(sample)
            sample['gpt4_prefix'] = prefix
            sample['gpt4_suffix'] = suffix
            sample['judge']['others'] = sample['others']
            sample['ref'] = str(sample['others']['answers'])
            enriched.append(sample)
        return Dataset.from_list(enriched)
|