# flake8: noqa
import json
import os.path as osp
import re

import numpy as np
import pandas as pd
from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import (DICT_POSTPROCESSORS, ICL_EVALUATORS,
                                  LOAD_DATASET)
from opencompass.utils import get_data_path

from ..base import BaseDataset

prompt_choice_prefix = """
Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.

- Do not let the order of presentation, response length, or assistant names influence your judgment.
- Base your decision solely on how well each response addresses the user’s question and adheres to the instructions.

Your final reply must be structured in the following format:
{
"Choice": "[Model A or Model B]"
}
"""

prompt_choice_en = """User Question: {question}

Model A's Response: {answerA}

Model B's Response: {answerB}

Now it's your turn. Please provide selection result as required:
"""
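
# Each record in the input JSON file is expected to provide at least the
# fields read in RewardBenchDataset.load below (a sketch inferred from that
# code, not an authoritative schema; value types are left open):
#
#     {
#         "prompt": ...,           # the user question
#         "chosen": ...,           # preferred response
#         "rejected": ...,         # rejected response
#         "chosen_model": ...,     # name of the model behind "chosen"
#         "rejected_model": ...,   # name of the model behind "rejected"
#         "winner": "A" or "B",    # position the chosen response is placed in
#         "subset": ...            # RewardBench subset label
#     }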

@LOAD_DATASET.register_module()
class RewardBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):

        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for item in data:
                conversation_a = item['chosen']
                conversation_b = item['rejected']
                model_a = item['chosen_model']
                model_b = item['rejected_model']
                question = item['prompt']
                winner = item['winner']
                # When the recorded winner is 'B', swap the two responses (and
                # their model names) so the chosen response sits in position B
                # of the judge prompt.
                if winner == 'B':
                    conversation_a, conversation_b = conversation_b, conversation_a
                    model_a, model_b = model_b, model_a
                subset = item['subset']
                prompt = prompt_choice_prefix + prompt_choice_en.format(
                    question=question,
                    answerA=conversation_a,
                    answerB=conversation_b)
                lan = 'en'
                # Keep both the fully formatted judge prompt and the raw fields
                # needed later for evaluation.
                raw_data.append({
                    'prompt': prompt,
                    'judge': {
                        'prompt': item['prompt'],
                        'Answer_A': conversation_a,
                        'Answer_B': conversation_b,
                        'subset': subset,
                        'winner': winner,
                        'model_a': model_a,
                        'model_b': model_b,
                        'dataset_name': 'rewardbench',
                        'lan': lan
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
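
# Sketch of how this dataset is typically wired into an OpenCompass config
# (illustrative only; the abbr, data path, file name, and the reader/infer/eval
# configs below are placeholders, not values defined by this module):
#
#     rewardbench_datasets = [
#         dict(type=RewardBenchDataset,
#              abbr='rewardbench',
#              path='data/rewardbench',
#              name='your_split.json',
#              reader_cfg=...,
#              infer_cfg=...,
#              eval_cfg=...),
#     ]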