OpenCompass/opencompass/datasets/subject_ir.py
# flake8: noqa: E501
import json
import os.path as osp
import re
from typing import Optional

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset

# English judge prompt: the judge must label the model response as
# [[Correct]] or [[Wrong]] against a candidate list of reference answers.
eng_base_prefix = """
You are a judger. Please impartially judge whether an AI model's response to a question is correct based on the reference answers. You need to provide a conclusion of "correct" or "wrong," followed by the corresponding reasoning.
Note that since the reference answer is a candidate list, the AI model's response only needs to align with one item in the list to be deemed "correct."
Your judgment must strictly adhere to the following format:
Conclusion: [[Correct]]
Reasoning: xxx.
Conclusion: [[Wrong]]
Reasoning: xxx.
[Question Start]
{question}
[Question End]
[Reference Answers Start]
{ref}
[Reference Answers End]
[Model Response Start]
"""

# Chinese counterpart of eng_base_prefix, used when sample['others']['lan'] == 'zh'.
chn_base_prefix = """
你是一个评判者，请你基于参考答案，公正地评判一个AI模型对于问题的回答是否正确。你需要给出“对或错”的结论，然后再给出相应的理由。
请注意，由于参考答案是一个候选列表，因此AI模型的回答只要符合列表中的某一项即可判断为“对”。
你的评判必须严格遵守以下格式：
结论：[[对]]
理由：xxx。
结论：[[错]]
理由：xxx。
[问题开始]
{question}
[问题结束]
[参考答案开始]
{ref}
[参考答案结束]
[模型回答开始]
"""


def prompt_construct(sample):
    """Build the judge prompt pieces for one sample.

    Returns a (prefix, suffix) pair; the model response is placed between
    them before being sent to the judge. Assumes sample['others']['lan'] is
    either 'zh' or 'en'; any other value would leave prefix/suffix unbound.
    """
    lan = sample['others']['lan']
    question = sample['question']
    if lan == 'zh':
        prefix = chn_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[模型回答结束]\n'
    elif lan == 'en':
        prefix = eng_base_prefix.format(question=sample['question'],
                                        ref=str(sample['others']['answers']))
        suffix = '\n[Model Response End]\n'
    return prefix, suffix
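

# Example of the sample schema prompt_construct expects (illustrative only,
# not part of the upstream file):
#     {'question': 'Who wrote "Hamlet"?',
#      'others': {'lan': 'en', 'answers': ['William Shakespeare']}}
# For such a sample it returns eng_base_prefix with {question} and {ref}
# filled in, plus the '\n[Model Response End]\n' suffix; a 'zh' sample gets
# the Chinese prefix/suffix pair instead.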


@LOAD_DATASET.register_module()
class IRDataset(SubjectiveCmpDataset):

    def load(
        self,
        path: str,
        name: str,
    ):
        dataset = list(super().load(path, name))
        subject_dataset = []
        for data in dataset:
            # Attach the judge prompt (prefix/suffix), copy 'others' into the
            # judge record, and expose the reference answers as 'ref' before
            # rebuilding the HuggingFace Dataset.
            data['gpt4_prefix'], data['gpt4_suffix'] = prompt_construct(data)
            data['judge']['others'] = data['others']
            data['ref'] = str(data['others']['answers'])
            subject_dataset.append(data)
        dataset = Dataset.from_list(subject_dataset)
        return dataset
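

# Usage sketch (illustrative only; names below are fabricated, not part of
# the upstream file): after IRDataset.load(), every row carries
# 'gpt4_prefix', 'gpt4_suffix' and 'ref', so a judge prompt for a model
# prediction can be assembled as
#
#     row = dataset[0]
#     judge_prompt = row['gpt4_prefix'] + prediction + row['gpt4_suffix']
#
# where `prediction` (hypothetical variable) is the evaluated model's answer.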