# flake8: noqa: F401, F403
import json
import re
from collections import defaultdict

from datasets import Dataset, DatasetDict

from opencompass.datasets.subjective.compass_arena_subjective_bench import \
    get_element_counts
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference

score_prompt = """# Instruction

You are an expert evaluator. Your task is to evaluate the quality of \
the response generated by an AI model.
We will provide you with the user query and an AI-generated response.
You should first read the user query and the conversation history \
carefully for analyzing the task, and then evaluate the quality of \
the response based on the checklist and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{user_query}

<|end_of_query|>

## AI Response
<|begin_of_response|>

{prediction}

<|end_of_response|>


# Evaluation

## Checklist

<|begin_of_checklist|>

{checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do \
not limit your assessment to the checklist.

## Rules

You should evaluate the above response based on your analysis \
of the user query and the conversation history.
You should first write down your analysis and the checklist \
that you used for the evaluation, and then provide your \
assessment according to the checklist.
The scores are in the range of 1~10, where 1 means the \
response is very poor and 10 means the response is perfect.
Here are more detailed criteria for the scores:

- Score 1~2: The response is very poor and does not make sense at all.
- Score 3~4: The response is poor and does not help the user solve the problem \
in a meaningful way.
- Score 5~6: The response is fair but has some issues (e.g., factual \
errors, hallucinations, missing key information).
- Score 7~8: The response is good enough but could be improved in some ways.
- Score 9~10: The response is perfect and provides helpful information that \
can help the user solve the problem.

## Output Format
First, please output your analysis for the model response, and then summarize \
your assessment into two aspects: "strengths" and "weaknesses"; finally, please \
write down your rating for the assessment.

Please provide your evaluation results in the following json format by filling \
in the placeholders in []:
```
{
    "strengths": "[analysis for the strengths of the response]",
    "weaknesses": "[analysis for the weaknesses of the response]",
    "score": "[1~10]"
}
```"""

pair_prompt = """# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the \
responses generated by two AI models.
We will provide you with the user query and a pair of AI-generated \
responses (Response A and Response B).
You should first read the user query and the conversation history \
carefully for analyzing the task, and then evaluate the quality of the \
responses based on the checklist and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{user_query}

<|end_of_query|>

## Response A
<|begin_of_response_A|>

{prediction}

<|end_of_response_A|>

## Response B
<|begin_of_response_B|>

{prediction2}

<|end_of_response_B|>

# Evaluation

## Checklist

<|begin_of_checklist|>

{checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your \
assessment to the checklist.

## Rules

You should compare the above two responses based on your analysis of the \
user query and the conversation history.
You should first write down your analysis and the checklist that you used \
for the evaluation, and then provide your assessment according to the \
checklist.
There are five choices to give your final assessment: ["A++", "A+", \
"A=B", "B+", "B++"], which correspond to the following meanings:

- `A++`: Response A is much better than Response B.
- `A+`: Response A is only slightly better than Response B.
- `A=B`: Response A and B are of the same quality. Please use this \
choice sparingly.
- `B+`: Response B is only slightly better than Response A.
- `B++`: Response B is much better than Response A.


## Output Format
First, please output your analysis for each model response, and \
then summarize your assessment into three aspects: "reason A=B", \
"reason A>B", and "reason B>A", and finally make your choice for \
the final assessment.

Please provide your evaluation results in the following json \
format by filling in the placeholders in []:
```
{
    "analysis of A": "[analysis of Response A]",
    "analysis of B": "[analysis of Response B]",
    "reason of A=B": "[where Response A and B perform equally well]",
    "reason of A>B": "[where Response A is better than Response B]",
    "reason of B>A": "[where Response B is better than Response A]",
    "choice": "[A++ or A+ or A=B or B+ or B++]"
}
```
"""


def parse_conversation(conversation):
    # parse conversation into chat dialogue
    role_dict = {'user': 'HUMAN', 'assistant': 'assistant'}
    chat_round = []
    history = ''
    if len(conversation) > 0:
        for x in conversation[:-1]:
            if x['role'] == 'user':
                history += 'USER: ' + x['content'] + '\n\n'
            elif x['role'] == 'assistant':
                history += 'ASSISTANT: ' + x['content'] + '\n\n'

            chat_round.append({
                'role': role_dict[x['role']],
                'content': x['content']
            })

    last_query = conversation[-1]['content']
    chat_round.append({
        'role': role_dict[conversation[-1]['role']],
        'content': conversation[-1]['content'],
    })
    chat_round.append({'role': 'assistant', 'content': ''})

    return chat_round, last_query, history
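
# Illustrative sketch of what parse_conversation returns for a hypothetical
# three-turn conversation (values are made up, not taken from the dataset):
#
#   conversation = [
#       {'role': 'user', 'content': 'Hi'},
#       {'role': 'assistant', 'content': 'Hello!'},
#       {'role': 'user', 'content': 'Write a haiku.'},
#   ]
#   chat_round, last_query, history = parse_conversation(conversation)
#   # chat_round -> [{'role': 'HUMAN', 'content': 'Hi'},
#   #                {'role': 'assistant', 'content': 'Hello!'},
#   #                {'role': 'HUMAN', 'content': 'Write a haiku.'},
#   #                {'role': 'assistant', 'content': ''}]
#   # last_query -> 'Write a haiku.'
#   # history    -> 'USER: Hi\n\nASSISTANT: Hello!\n\n'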


@LOAD_DATASET.register_module()
class WildBenchDataset(BaseDataset):

    def load(self, path: str, K=-1, eval_mode='pair', *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        dataset = DatasetDict()
        raw_data = []
        with open(path, 'r', encoding='utf-8') as file:
            for line in file:
                item = json.loads(line)
                chat_round, last_query, history = parse_conversation(
                    item['turn'])

                checklist_markdown = ''
                for checklist_item in item['checklist']:
                    checklist_markdown += f'- {checklist_item}\n'

                if eval_mode == 'single':
                    prompt = score_prompt
                elif eval_mode == 'pair':
                    prompt = pair_prompt
                else:
                    raise NotImplementedError(
                        f'Eval mode {eval_mode} not in single or pair.')

                prompt = prompt.replace('{history}', history)
                prompt = prompt.replace('{user_query}', last_query)
                prompt = prompt.replace('{checklist}', checklist_markdown)

                raw_data.append({
                    'dialogue': chat_round,
                    'history': history,
                    'prompt': prompt,
                    'judge': {
                        'other': None,
                        'primary_tag': item['primary_tag'],
                        'secondary_tag': item['secondary_tag'],
                        'question_id': item['session_id'],
                    },
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
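
# Each record produced by load() is a dict shaped roughly like the sketch below
# (field values are illustrative; the 'judge' metadata is consumed by the
# postprocessors further down in this file):
#
#   {
#       'dialogue': [...],             # chat turns from parse_conversation
#       'history': 'USER: ...',        # flattened conversation history
#       'prompt': '# Instruction ...', # score_prompt/pair_prompt with
#                                      # {history}/{user_query}/{checklist} filled
#       'judge': {
#           'other': None,
#           'primary_tag': 'Coding & Debugging',
#           'secondary_tag': [...],
#           'question_id': '...',
#       },
#   }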


# Map fine-grained WildBench task tags onto the coarse capability groups
# used for reporting.
task_group_new = {
    'Information seeking': 'Information/Advice seeking',
    'Creative Writing': 'Creative Tasks',
    'Coding & Debugging': 'Coding & Debugging',
    'Reasoning': 'Planning & Reasoning',
    'Editing': 'Creative Tasks',
    'Math': 'Math & Data Analysis',
    'Planning': 'Planning & Reasoning',
    'Brainstorming': 'Creative Tasks',
    'Role playing': 'Creative Tasks',
    'Advice seeking': 'Information/Advice seeking',
    'Data Analysis': 'Math & Data Analysis',
    'Others': 'Creative Tasks',
}


def post_process_wildbench_pair(judgement: dict):
    judgement = judgement['prediction']
    pattern = r"\"choice\": \"(.*?)\""
    matched_result = re.findall(pattern, judgement)
    if matched_result:
        return matched_result[0]
    else:
        return None
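
# Illustrative sketch of the pairwise parsing (the judge text is made up):
#
#   post_process_wildbench_pair({'prediction': '... "choice": "A+" ...'})
#   # -> 'A+'
#   # Returns None when no `"choice": "..."` field appears in the judge output.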


def post_process_wildbench_single(judgement: dict):
    judgement = judgement['prediction']
    pattern = r"\"score\": \"(.*?)\""
    matched_result = re.findall(pattern, judgement)
    try:
        score = float(matched_result[0])
        return {'score': score}
    except (ValueError, IndexError):
        return None

    # if matched_result:
    #     score = float(matched_result[0])
    # else:
    #     return None
    # return {'score': score}
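
# Illustrative sketch of the single-response parsing (the judge text is made up):
#
#   post_process_wildbench_single({'prediction': '... "score": "8" ...'})
#   # -> {'score': 8.0}
#   post_process_wildbench_single({'prediction': 'no parsable rating here'})
#   # -> None  (the regex finds nothing, so indexing raises IndexError)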


@DICT_POSTPROCESSORS.register_module('wildbench')
def wildbench_postprocess(
    output: dict,
    output_path: str,
) -> dict:

    judged_answers, references = get_judgeanswer_and_reference(
        result=output,
        filename=output_path,
        post_process=post_process_wildbench_pair,
    )

    if 'base_models' in references[0]:
        base_models = references[0]['base_models']
    else:
        base_models = ['HaiKu', 'gpt4-turbo', 'llama-2-70b-chat-hf']

    if isinstance(base_models, str):
        base_models = [base_models]

    win_base_model = defaultdict(float)
    win_compare_model = defaultdict(float)
    categories = defaultdict(float)

    # Map the judge's verdict to a signed score for "A" (answer1); `flag` below
    # flips the sign when answer1 is not a base model, so that score_1 always
    # credits the base model side and score_2 the compared model.
    score_mapping = {'A++': 1, 'A+': 0.5, 'A=B': 0, 'B+': -0.5, 'B++': -1}
    for judged_answer, reference in zip(judged_answers, references):
        if judged_answer not in score_mapping:
            continue

        flag = 1 if reference['answer1'] in base_models else -1
        score_1 = score_mapping[judged_answer] * flag
        score_2 = -score_1

        tags = [reference['primary_tag']] + reference['secondary_tag']
        for tag in tags:
            win_base_model[task_group_new[tag]] += score_1
            win_compare_model[task_group_new[tag]] += score_2
            categories[task_group_new[tag]] += 1

    for capability in categories:
        win_base_model[capability] = (win_base_model[capability] /
                                      categories[capability] * 100)
        win_base_model[capability] = round(win_base_model[capability], 2)
        win_compare_model[capability] = (win_compare_model[capability] /
                                         categories[capability] * 100)
        win_compare_model[capability] = round(win_compare_model[capability], 2)

    # Compute the mean of the per-capability scores for the compared model
    average = sum(win_compare_model.values()) / len(win_compare_model)

    # Add the mean to the results dictionary
    win_compare_model['average'] = average

    results = win_compare_model
    results['details'] = output
    return results
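
# The returned dict is shaped roughly as below (capability names come from
# task_group_new; numbers are illustrative). Each per-capability value is the
# compared model's average head-to-head score against the base models, scaled
# to the range [-100, 100]:
#
#   {
#       'Creative Tasks': 12.5,
#       'Planning & Reasoning': -3.33,
#       ...
#       'average': 4.59,     # mean over the per-capability values above
#       'details': output,   # raw judge records, kept for inspection
#   }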


@DICT_POSTPROCESSORS.register_module('wildbench_bradleyterry')
def wildbench_bradleyterry_postprocess(
    output: dict,
    output_path: str,
) -> dict:

    judged_answers, references = get_judgeanswer_and_reference(
        result=output,
        filename=output_path,
        post_process=post_process_wildbench_pair,
    )

    if 'prediction1' not in references[0]:
        raise ValueError(
            'prediction1 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
        )

    if 'prediction2' not in references[0]:
        raise ValueError(
            'prediction2 not in references. Set `keep_predictions=True` for LMEvaluator in dataset config and retry.'
        )

    score_mapping = {
        'A++': 'model_a',
        'A+': 'model_a',
        'A=B': 'tie',
        'B+': 'model_b',
        'B++': 'model_b',
    }

    results = {}
    matches = []
    for judged_answer, reference in zip(judged_answers, references):
        cur_dict = {}

        if judged_answer in score_mapping:
            cur_dict['winner'] = score_mapping[judged_answer]
        else:
            # cur_dict["winner"] = (
            #     "tie"  # Count match as tie if judge answer cannot be parsed.
            # )

            # Skip if judge answer cannot be parsed
            print('Judge answer cannot be parsed. Skipping record...')
            continue

        cur_dict['primary_tag'] = reference['primary_tag']
        # Extract first tag from list and set as categorical level.
        # Can be used as categorical variable in Bradley-Terry model
        cur_dict['secondary_tag'] = (reference['secondary_tag'][0]
                                     if len(reference['secondary_tag']) > 0
                                     else 'Others')
        # Keep original secondary tag list for reference
        cur_dict['secondary_tags'] = reference['secondary_tag']
        cur_dict['model_a'] = reference['answer1']
        cur_dict['model_b'] = reference['answer2']
        cur_dict['prediction1'] = reference['prediction1']
        cur_dict['prediction2'] = reference['prediction2']

        matches.append(cur_dict)

    ### ---------- Add Style Metadata ---------- ###
    matches = get_element_counts(
        data=matches,
        column='prediction1',
        suffix='_a',
    )
    matches = get_element_counts(
        data=matches,
        column='prediction2',
        suffix='_b',
    )

    results['matches'] = matches
    # results["details"] = output

    return results
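
# Each entry in results['matches'] is shaped roughly as below (values are
# illustrative; the exact style-metadata keys appended by get_element_counts
# under the '_a'/'_b' suffixes depend on that helper):
#
#   {
#       'winner': 'model_a',                  # or 'model_b' / 'tie'
#       'primary_tag': 'Coding & Debugging',
#       'secondary_tag': 'Reasoning',         # first secondary tag, or 'Others'
#       'secondary_tags': ['Reasoning'],
#       'model_a': '...', 'model_b': '...',
#       'prediction1': '...', 'prediction2': '...',
#       # ... plus style-count columns ending in '_a' and '_b'
#   }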