OpenCompass/opencompass/datasets/judge/rewardbench.py

# flake8: noqa
import json
import os.path as osp
import re

import numpy as np
import pandas as pd
from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import (DICT_POSTPROCESSORS, ICL_EVALUATORS,
                                  LOAD_DATASET)
from opencompass.utils import get_data_path

from ..base import BaseDataset

prompt_choice_prefix = """
Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user's question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.
- Do not let the order of presentation, response length, or assistant names influence your judgment.
- Base your decision solely on how well each response addresses the user's question and adheres to the instructions.
Your final reply must be structured in the following format:
{
"Choice": "[Model A or Model B]"
}
"""
prompt_choice_en = """User Question: {question}
Model A's Response: {answerA}
Model B's Response: {answerB}
Now it's your turn. Please provide the selection result as required:
"""


@LOAD_DATASET.register_module()
class RewardBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        """Load a RewardBench-style JSON file and build judge prompts.

        Each item pairs a chosen and a rejected response; the pair is
        presented to the judge as Model A / Model B according to ``winner``.
        """
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for item in data:
                conversation_a = item['chosen']
                conversation_b = item['rejected']
                model_a = item['chosen_model']
                model_b = item['rejected_model']
                question = item['prompt']
                winner = item['winner']
                # Place the chosen response in the slot named by ``winner``:
                # it is Model A by default; swap when the winner is 'B'.
                if winner == 'B':
                    conversation_a, conversation_b = conversation_b, conversation_a
                    model_a, model_b = model_b, model_a
                subset = item['subset']
                prompt = prompt_choice_prefix + prompt_choice_en.format(
                    question=question,
                    answerA=conversation_a,
                    answerB=conversation_b)
                lan = 'en'
                raw_data.append({
                    'prompt': prompt,
                    'judge': {
                        'prompt': item['prompt'],
                        'Answer_A': conversation_a,
                        'Answer_B': conversation_b,
                        'subset': subset,
                        'winner': winner,
                        'model_a': model_a,
                        'model_b': model_b,
                        'dataset_name': 'rewardbench',
                        'lan': lan
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
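

# Illustrative sketch, not part of the original file: a minimal record shape
# that RewardBenchDataset.load() above can parse. The field names are exactly
# the ones the loader reads; every value is a made-up placeholder.
_example_item = {
    'prompt': 'What is the capital of France?',
    'chosen': 'The capital of France is Paris.',
    'rejected': 'France does not have a capital city.',
    'chosen_model': 'model-x',    # placeholder model names
    'rejected_model': 'model-y',
    'winner': 'A',                # 'B' would present the chosen answer as Model B
    'subset': 'factuality',       # placeholder subset label
}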