# flake8: noqa
import json
import os.path as osp
import re

import numpy as np
import pandas as pd
from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import (DICT_POSTPROCESSORS, ICL_EVALUATORS,
                                  LOAD_DATASET)
from opencompass.utils import get_data_path

from ..base import BaseDataset

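# Judge prompt templates: prompt_choice_prefix states the evaluation criteria
# and the required JSON reply format; prompt_choice_en slots in the question
# and the two candidate responses.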
prompt_choice_prefix = """
Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.

- Do not let the order of presentation, response length, or assistant names influence your judgment.
- Base your decision solely on how well each response addresses the user’s question and adheres to the instructions.

Your final reply must be structured in the following format:
{
"Choice": "[Model A or Model B]"
}
"""

prompt_choice_en = """User Question: {question}

Model A's Response: {answerA}

Model B's Response: {answerB}

Now it's your turn. Please provide the selection result as required:
"""
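
# Illustrative sketch (not part of the upstream module): the judge is asked to
# reply with the JSON object shown in prompt_choice_prefix. A minimal,
# hypothetical parser for that reply could look like the helper below; the
# regex tolerates stray brackets or quotes around the JSON.
def _parse_judge_choice(reply: str):
    """Return 'A' or 'B' from a judge reply, or None if it cannot be parsed."""
    match = re.search(r'"Choice"\s*:\s*"?\[?Model\s+([AB])\b', reply)
    return match.group(1) if match else None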


@LOAD_DATASET.register_module()
class RewardBenchDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, name)
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
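            # Each record is expected to carry 'prompt' (the user question),
            # 'chosen'/'rejected' (the two candidate responses),
            # 'chosen_model'/'rejected_model', 'winner' (the preferred side),
            # and 'subset'.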
            for item in data:
                conversation_a = item['chosen']
                conversation_b = item['rejected']
                model_a = item['chosen_model']
                model_b = item['rejected_model']
                question = item['prompt']
                winner = item['winner']
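                # 'winner' names the slot the preferred ('chosen') response
                # should occupy; it starts in slot A, so swap both answers and
                # model names when the labelled winner is 'B'.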
                if winner == 'B':
                    conversation_a, conversation_b = conversation_b, conversation_a
                    model_a, model_b = model_b, model_a
                subset = item['subset']
                prompt = prompt_choice_prefix + prompt_choice_en.format(
                    question=question,
                    answerA=conversation_a,
                    answerB=conversation_b)
                lan = 'en'
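                # 'prompt' is the text shown to the judge model; 'judge'
                # bundles the original fields so the verdict can be scored
                # downstream.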
                raw_data.append({
                    'prompt': prompt,
                    'judge': {
                        'prompt': item['prompt'],
                        'Answer_A': conversation_a,
                        'Answer_B': conversation_b,
                        'subset': subset,
                        'winner': winner,
                        'model_a': model_a,
                        'model_b': model_b,
                        'dataset_name': 'rewardbench',
                        'lan': lan
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
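

# Usage sketch (illustrative only; the data path and file name below are
# hypothetical). In practice opencompass instantiates the class from a config
# via the LOAD_DATASET registry:
#
#     dataset = RewardBenchDataset().load(path='data/rewardbench',
#                                         name='rewardbench_eval.json')
#     print(dataset[0]['judge']['winner'])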