OpenCompass/opencompass/datasets/judge/rewardbench.py

# flake8: noqa
import json
import os.path as osp
import re

import numpy as np
import pandas as pd
from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import (DICT_POSTPROCESSORS, ICL_EVALUATORS,
                                  LOAD_DATASET)
from opencompass.utils import get_data_path

from ..base import BaseDataset

# Instruction prefix shared by every pairwise judging prompt; the judge must
# answer with a JSON object naming the better response.
prompt_choice_prefix = """
Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.

- Do not let the order of presentation, response length, or assistant names influence your judgment.
- Base your decision solely on how well each response addresses the user's question and adheres to the instructions.

Your final reply must be structured in the following format:
{
  "Choice": "[Model A or Model B]"
}
"""

# Per-sample template filled with the question and the two candidate answers.
prompt_choice_en = """User Question: {question}

Model A's Response: {answerA}

Model B's Response: {answerB}

Now it's your turn. Please provide the selection result as required:
"""


@LOAD_DATASET.register_module()
class RewardBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for item in data:
                # 'chosen' holds the preferred (gold) response,
                # 'rejected' the dispreferred one.
                conversation_a = item['chosen']
                conversation_b = item['rejected']
                model_a = item['chosen_model']
                model_b = item['rejected_model']
                question = item['prompt']
                winner = item['winner']
                # Place the chosen response in the slot named by the
                # pre-assigned 'winner' label so the gold answer is not
                # always Model A (guards against position bias).
                if winner == 'B':
                    conversation_a, conversation_b = conversation_b, conversation_a
                    model_a, model_b = model_b, model_a
                subset = item['subset']
                # Build the full judge prompt from the shared instruction
                # prefix and the per-sample comparison template.
                prompt = prompt_choice_prefix + prompt_choice_en.format(
                    question=question,
                    answerA=conversation_a,
                    answerB=conversation_b)
                lan = 'en'
                raw_data.append({
                    'prompt': prompt,
                    # Metadata consumed by the evaluator; 'winner' is the
                    # gold label the judge's "Choice" is scored against.
                    'judge': {
                        'prompt': item['prompt'],
                        'Answer_A': conversation_a,
                        'Answer_B': conversation_b,
                        'subset': subset,
                        'winner': winner,
                        'model_a': model_a,
                        'model_b': model_b,
                        'dataset_name': 'rewardbench',
                        'lan': lan
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
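
# Minimal usage sketch (not part of the original file; OpenCompass normally
# instantiates this class from a dataset config). The directory and file name
# below are hypothetical placeholders; `load` ignores `self`, so it can be
# exercised directly for a quick check:
#
#   dataset = RewardBenchDataset.load(None, path='data/rewardbench',
#                                     name='filtered.json')
#   print(dataset[0]['judge']['winner'])  # gold label: 'A' or 'B'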