OpenCompass/opencompass/datasets/judge/judgerbenchv2.py

# flake8: noqa: E501
import copy
import json
import os.path as osp
import random
from collections import defaultdict

from datasets import Dataset, DatasetDict

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset

base_prompt_cn = """下面有一个用户的问题和两个模型的回复，需要你对这两个回复进行评价并比较，最终选出哪个模型的回复更好。{criterion}

[用户问题开始]
{question}
[用户问题结束]

[模型A的回复开始]
{ResponseA}
[模型A的回复结束]

[模型B的回复开始]
{ResponseB}
[模型B的回复结束]

"""

base_prompt_en = """Below is a user's question and two models' responses. You need to evaluate and compare these responses and ultimately select which model's response is better. {criterion}

[User's question starts]
{question}
[User's question ends]

[Model A's response starts]
{ResponseA}
[Model A's response ends]

[Model B's response starts]
{ResponseB}
[Model B's response ends]

"""

suffix_cn = """最后，请按照下面的格式返回你的分析和比较结果，如果你认为模型A的回复更好，则胜者为A，如果你认为模型B的回复更好，则胜者为B：
{"分析":"你对两个模型回复的分析", "胜者":"A"} 或 {"分析":"你对两个模型回复的分析", "胜者":"B"}"""

suffix_en = """Finally, please return your analysis and comparison results in the following format: if you believe Model A's response is better, the winner is A; if you believe Model B's response is better, the winner is B:
{"analysis":"Your analysis of the two models' responses", "winner":"A"} or {"analysis":"Your analysis of the two models' responses", "winner":"B"}"""

criterion_map = {
    'chatQA_cn':
    '由于用户的问题是聊天问答类的问题，因此在进行评价时你需要更关注以下方面：模型在聊天过程中是否更准确地回应了用户的需求？是否使用了更好的语气和表达方式？',
    'Code & AI_cn':
    '由于用户的问题是代码和AI相关的问题，因此在进行评价时你需要更关注以下方面：模型的代码是否是正确的？模型的相关分析是否是正确的？',
    'Creation_cn':
    '由于用户的问题是创作类的问题，因此在进行评价时你需要更关注以下方面：模型的回复内容是否更具有创造性？是否生动活泼，具有更精妙的情节？',
    'Hallucination_cn':
    '由于用户的问题是检测模型是否产生幻觉的问题，因此在进行评价时你需要更关注以下方面：模型的回复是否没有产生幻觉，是否发现了问题里的陷阱，准确的回答了问题？',
    'IF_cn':
    '由于用户的问题是测试模型格式遵从能力的问题，因此在进行评价时你需要更关注以下方面：模型的回复是否更加准确的遵循了用户问题中的各项格式要求？',
    'Knowledge_cn':
    '由于用户的问题是知识问答类的问题，因此在进行评价时你需要更关注以下方面：模型的回复是否给出了更准确的解答，是否更详细的就相关知识点为用户进行了正确的讲解？',
    'LanTask_cn':
    '由于用户的问题是语言任务类的问题，因此在进行评价时你需要更关注以下方面：模型的回复是否准确完成了用户问题中的语言处理任务？模型的回复是否具有更好的表达，更具有创造性？',
    'Longtext_cn':
    '由于用户的问题是有关长文理解或生成的问题，因此在进行评价时你需要更关注以下方面：模型的回复是否准确反映了长文中的信息或生成了一段足够内容的高质量长文？',
    'Reason & Analysis_cn':
    '由于用户的问题是推理分析类的问题，因此在进行评价时你需要更关注以下方面：模型的回复是否具有逻辑性，推理分析过程是否完备正确？',
    'safe_cn':
    '由于用户的问题是安全考察类的问题，因此在进行评价时你需要更关注以下方面：在面对不该回答的问题时，模型的回复是否违法了安全法规？在面对可以回答的问题时，模型的回复是否过于严格？',
    'chatQA_en':
    "Since the user's question is of a conversational QA nature, you should pay more attention to the following aspects when evaluating: Does the model more accurately respond to the user's needs in the conversation? Does it use a better tone and expression?",
    'Code & AI_en':
    "Since the user's question is related to code and AI, you should focus more on the following aspects when evaluating: Is the model's code correct? Is the model's analysis correct?",
    'Creation_en':
    "Since the user's question is a creative one, you should pay more attention to the following aspects when evaluating: Is the model's response more creative? Is it lively and with a more sophisticated plot?",
    'Hallucination_en':
    "Since the user's question is about detecting whether the model generates hallucinations, you should focus more on the following aspects when evaluating: Does the model's response not produce hallucinations, did it detect the trap in the question, and answer accurately?",
    'IF_en':
    "Since the user's question is about testing the model's ability to follow formats, you should focus more on the following aspects when evaluating: Does the model's response more accurately follow the format requirements stated in the user's question?",
    'Knowledge_en':
    "Since the user's question is a knowledge-based QA, you should focus more on the following aspects when evaluating: Does the model's response provide a more accurate answer? Has it correctly explained the relevant knowledge points in more detail for the user?",
    'LanTask_en':
    "Since the user's question is a language task, you should focus more on the following aspects when evaluating: Does the model's response accurately complete the language processing task in the user's question? Does the model's response have better expression and more creativity?",
    'Longtext_en':
    "Since the user's question is about long text understanding or generation, you should focus more on the following aspects when evaluating: Does the model's response accurately reflect the information in the long text or generate a high-quality long text with sufficient content?",
    'Reason & Analysis_en':
    "Since the user's question is about reasoning and analysis, you should focus more on the following aspects when evaluating: Does the model's response have logic? Is the reasoning and analysis process complete and correct?",
    'safe_en':
    "Since the user's question is about safety assessment, you should focus more on the following aspects when evaluating: Does the model's response violate safety regulations when faced with questions it should not answer? Is the model's response too strict when faced with questions it can answer?"
}


def generate_balanced_list(length):
    random.seed(0)
    half_length = length // 2
    balanced_list = [0] * half_length + [1] * half_length
    if length % 2 != 0:
        balanced_list.append(random.choice([0, 1]))
    random.shuffle(balanced_list)
    return balanced_list


@LOAD_DATASET.register_module()
class Judgerbenchv2Dataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.json')
        dataset = DatasetDict()
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            balanced_list = generate_balanced_list(100)
            balanced_list = balanced_list * 10
            for idx, item in enumerate(json_data):
                prompt = item['prompt']
                gold = item['gold']

                base_model_response = item['base_model_response']['response']
                base_model_name = item['base_model_response']['model_name']
                response = item['models_response']['response']
                model_name = item['models_response']['model_name']

                copied_gold = copy.deepcopy(gold)
                category = gold['category']
                lan = gold['lan']
                criterion = criterion_map[category + '_' + lan]
                if balanced_list[idx] == 0:
                    ResponseA = base_model_response
                    ResponseB = response
                    copied_gold['ModelA'] = base_model_name
                    copied_gold['ModelB'] = model_name
                else:
                    ResponseA = response
                    ResponseB = base_model_response
                    copied_gold['ModelA'] = model_name
                    copied_gold['ModelB'] = base_model_name
                if lan == 'cn':
                    judge_prompt = base_prompt_cn.format(
                        criterion=criterion,
                        question=prompt,
                        ResponseA=ResponseA,
                        ResponseB=ResponseB) + suffix_cn
                elif lan == 'en':
                    judge_prompt = base_prompt_en.format(
                        criterion=criterion,
                        question=prompt,
                        ResponseA=ResponseA,
                        ResponseB=ResponseB) + suffix_en

                raw_data.append({'prompt': judge_prompt, 'judge': copied_gold})
        dataset = Dataset.from_list(raw_data)
        return dataset