# flake8: noqa
import json
import os.path as osp
import re

import numpy as np
import pandas as pd
from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import (DICT_POSTPROCESSORS, ICL_EVALUATORS,
                                  LOAD_DATASET)
from opencompass.utils import get_data_path

from ..base import BaseDataset

# Instructions shown to the judge model ahead of every comparison.
# NOTE(review): the prompt text below is reproduced character-for-character
# from the version of this file under review, where it appears as one run of
# space-separated text. If the upstream file carries line breaks inside these
# literals, restore them there -- confirm before relying on the layout.
prompt_choice_prefix = (
    ' Please act as an impartial judge to evaluate the responses provided by '
    'two AI assistants to the user question below. Your evaluation should '
    'focus on the following criteria: helpfulness, relevance, accuracy, '
    'depth, creativity, and level of detail. - Do not let the order of '
    'presentation, response length, or assistant names influence your '
    'judgment. - Base your decision solely on how well each response '
    'addresses the user’s question and adheres to the instructions. Your '
    'final reply must be structured in the following format: '
    '{ "Choice": "[Model A or Model B]" } '
)

# Per-sample template; filled in with ``str.format`` using the fields
# ``question``, ``answerA`` and ``answerB``.
prompt_choice_en = (
    "User Question: {question} "
    "Model A's Response: {answerA} "
    "Model B's Response: {answerB} "
    "Now it's your turn. \n"
    'Please provide selection result as required: '
)


def _make_record(item):
    """Turn one raw RewardBench entry into a prompt/judge record.

    ``item`` is indexed with the keys ``chosen``, ``rejected``,
    ``chosen_model``, ``rejected_model``, ``prompt``, ``winner`` and
    ``subset``; a missing key raises ``KeyError``.
    """
    answer_a, answer_b = item['chosen'], item['rejected']
    name_a, name_b = item['chosen_model'], item['rejected_model']
    question = item['prompt']
    winner = item['winner']

    # When the recorded winner is 'B', exchange the two sides so that the
    # chosen response is the one presented as "Model B".
    if winner == 'B':
        answer_a, answer_b = answer_b, answer_a
        name_a, name_b = name_b, name_a

    subset = item['subset']

    # Only the per-sample template is formatted; the prefix contains literal
    # braces and is concatenated untouched.
    filled = prompt_choice_en.format(question=question,
                                     answerA=answer_a,
                                     answerB=answer_b)

    return {
        'prompt': prompt_choice_prefix + filled,
        'judge': {
            'prompt': question,
            'Answer_A': answer_a,
            'Answer_B': answer_b,
            'subset': subset,
            'winner': winner,
            'model_a': name_a,
            'model_b': name_b,
            'dataset_name': 'rewardbench',
            'lan': 'en',
        },
    }


@LOAD_DATASET.register_module()
class RewardBenchDataset(BaseDataset):
    """Pairwise-judging dataset built from a RewardBench JSON file."""

    def load(self, path: str, name: str, *args, **kwargs):
        """Read ``<path>/<name>`` and build the judging dataset.

        Args:
            path: Dataset location, resolved through ``get_data_path`` with
                ``local_mode=True``.
            name: File name inside the resolved directory; the file is
                parsed as JSON and iterated item by item.
            *args, **kwargs: Accepted and ignored.

        Returns:
            A ``datasets.Dataset`` with one row per JSON item, each holding
            the judge ``prompt`` string and a ``judge`` metadata dict.
        """
        data_root = get_data_path(path, local_mode=True)
        json_file = osp.join(data_root, f'{name}')

        with open(json_file, 'r', encoding='utf-8') as fp:
            entries = json.load(fp)

        records = []
        for entry in entries:
            records.append(_make_record(entry))

        return Dataset.from_list(records)