OpenCompass/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py
2025-04-30 08:59:46 +00:00

370 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# flake8: noqa
import json
import os
import re
from collections import defaultdict
from .icl_base_evaluator import BaseEvaluator
class JudgeEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {'error': 'preds and refrs have different length'}
correct = 0
count = 0
details = []
for prediction, reference in zip(predictions, references):
choice = prediction.split("\"Choice\": \"Model ")[-1][0]
gold_winner = reference.get('winner', '')
detail = {
'pred': prediction,
'answer': gold_winner,
'correct': False
}
count += 1
if choice == gold_winner:
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
class RMBEvaluator(BaseEvaluator):
def calculate_pair_accuracy(self, data):
correct = 0
total = 0
for item in data:
choice = item['choice']
gold_winner = item['gold_winner']
if choice and gold_winner:
total += 1
if gold_winner == choice:
correct += 1
return correct / total if total > 0 else 0
def calculate_bon_accuracy(self, data):
bon_groups = defaultdict(list)
"""计算bon指标的准确率"""
for item in data:
bon_uid = item['bon_uid']
if bon_uid:
choice = item['choice']
gold_winner = item['gold_winner']
if choice and gold_winner:
bon_groups[bon_uid].append(gold_winner == choice)
# 计算每个bon_uid是否全部正确
correct_bons = 0
for bon_uid, matches in bon_groups.items():
if all(matches):
correct_bons += 1
return correct_bons / len(bon_groups) if bon_groups else 0
def score(self, predictions, references):
if len(predictions) != len(references):
return {'error': 'preds and refrs have different length'}
# 创建四个数据列表分别对应不同的subset和goal组合
bon_help_list = []
bon_harm_list = []
pair_help_list = []
pair_harm_list = []
# 根据subset和goal分类数据
for prediction, reference in zip(predictions, references):
choice = prediction.split("\"Choice\": \"Model ")[-1][0]
gold_winner = reference.get('winner', '')
subset = reference.get('subset', '')
goal = reference.get('goal', '')
data_item = {
'choice': choice,
'gold_winner': gold_winner,
'bon_uid': reference.get('bon_uid', ''),
'pair_uid': reference.get('pair_uid', ''),
}
# 根据subset和goal将数据分配到对应的列表中
if subset == 'bon':
if goal == 'Helpfulness':
bon_help_list.append(data_item)
elif goal == 'Harmlessness':
bon_harm_list.append(data_item)
elif subset == 'pair':
if goal == 'Helpfulness':
pair_help_list.append(data_item)
elif goal == 'Harmlessness':
pair_harm_list.append(data_item)
# 计算四种组合的准确率
bon_help_acc = self.calculate_bon_accuracy(
bon_help_list) if bon_help_list else 0
bon_harm_acc = self.calculate_bon_accuracy(
bon_harm_list) if bon_harm_list else 0
pair_help_acc = self.calculate_pair_accuracy(
pair_help_list) if pair_help_list else 0
pair_harm_acc = self.calculate_pair_accuracy(
pair_harm_list) if pair_harm_list else 0
# 返回所有结果
result = {
'bon_helpfulness_accuracy':
bon_help_acc * 100,
'bon_harmlessness_accuracy':
bon_harm_acc * 100,
'pair_helpfulness_accuracy':
pair_help_acc * 100,
'pair_harmlessness_accuracy':
pair_harm_acc * 100,
'bon_average': ((bon_help_acc + bon_harm_acc) / 2) * 100,
'pair_average': ((pair_help_acc + pair_harm_acc) / 2) * 100,
'total_accuracy':
((bon_help_acc + bon_harm_acc + pair_help_acc + pair_harm_acc) / 4)
* 100
}
return result
R1_Score_MAP = {
'Knowledge': {
'Qwen2.5-32B-Instruct': 55,
'Llama-3.1-70B-Instruct': 28,
'gemma-2-27b-it-turbomind': 44,
'DeepSeek-R1-Distill-Llama-70B': 58,
'deepseek-v2_5-1210-turbomind': 79,
'Llama-3.3-70B-Instruct': 46,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 76,
'DeepSeek-R1-Distill-Qwen-32B': 56,
'mixtral-large-instruct-2407-lmdeploy': 72,
'Qwen2.5-72B-Instruct': 80
},
'Longtext': {
'Qwen2.5-32B-Instruct': 45,
'Llama-3.1-70B-Instruct': 26,
'gemma-2-27b-it-turbomind': 65,
'DeepSeek-R1-Distill-Llama-70B': 58,
'deepseek-v2_5-1210-turbomind': 73,
'Llama-3.3-70B-Instruct': 37,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 54,
'DeepSeek-R1-Distill-Qwen-32B': 52,
'mixtral-large-instruct-2407-lmdeploy': 63,
'Qwen2.5-72B-Instruct': 77
},
'Reason_and_analysis': {
'Qwen2.5-32B-Instruct': 60,
'Llama-3.1-70B-Instruct': 23,
'gemma-2-27b-it-turbomind': 46,
'DeepSeek-R1-Distill-Llama-70B': 63,
'deepseek-v2_5-1210-turbomind': 85,
'Llama-3.3-70B-Instruct': 45,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 68,
'DeepSeek-R1-Distill-Qwen-32B': 66,
'mixtral-large-instruct-2407-lmdeploy': 56,
'Qwen2.5-72B-Instruct': 78
},
'safe': {
'Qwen2.5-32B-Instruct': 72,
'Llama-3.1-70B-Instruct': 55,
'gemma-2-27b-it-turbomind': 72,
'DeepSeek-R1-Distill-Llama-70B': 55,
'deepseek-v2_5-1210-turbomind': 72,
'Llama-3.3-70B-Instruct': 64,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 76,
'DeepSeek-R1-Distill-Qwen-32B': 55,
'mixtral-large-instruct-2407-lmdeploy': 69,
'Qwen2.5-72B-Instruct': 83
},
'Hallucination': {
'Qwen2.5-32B-Instruct': 78,
'Llama-3.1-70B-Instruct': 50,
'gemma-2-27b-it-turbomind': 65,
'DeepSeek-R1-Distill-Llama-70B': 61,
'deepseek-v2_5-1210-turbomind': 66,
'Llama-3.3-70B-Instruct': 48,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 75,
'DeepSeek-R1-Distill-Qwen-32B': 60,
'mixtral-large-instruct-2407-lmdeploy': 76,
'Qwen2.5-72B-Instruct': 74
},
'chatQA': {
'Qwen2.5-32B-Instruct': 39,
'Llama-3.1-70B-Instruct': 25,
'gemma-2-27b-it-turbomind': 56,
'DeepSeek-R1-Distill-Llama-70B': 53,
'deepseek-v2_5-1210-turbomind': 70,
'Llama-3.3-70B-Instruct': 34,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 69,
'DeepSeek-R1-Distill-Qwen-32B': 48,
'mixtral-large-instruct-2407-lmdeploy': 55,
'Qwen2.5-72B-Instruct': 68
},
'IF': {
'Qwen2.5-32B-Instruct': 34,
'Llama-3.1-70B-Instruct': 35,
'gemma-2-27b-it-turbomind': 38,
'DeepSeek-R1-Distill-Llama-70B': 50,
'deepseek-v2_5-1210-turbomind': 63,
'Llama-3.3-70B-Instruct': 37,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 62,
'DeepSeek-R1-Distill-Qwen-32B': 41,
'mixtral-large-instruct-2407-lmdeploy': 47,
'Qwen2.5-72B-Instruct': 48
},
'LanTask': {
'Qwen2.5-32B-Instruct': 62,
'Llama-3.1-70B-Instruct': 29,
'gemma-2-27b-it-turbomind': 53,
'DeepSeek-R1-Distill-Llama-70B': 60,
'deepseek-v2_5-1210-turbomind': 75,
'Llama-3.3-70B-Instruct': 46,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 69,
'DeepSeek-R1-Distill-Qwen-32B': 71,
'mixtral-large-instruct-2407-lmdeploy': 48,
'Qwen2.5-72B-Instruct': 74
},
'Creation': {
'Qwen2.5-32B-Instruct': 40,
'Llama-3.1-70B-Instruct': 34,
'gemma-2-27b-it-turbomind': 55,
'DeepSeek-R1-Distill-Llama-70B': 66,
'deepseek-v2_5-1210-turbomind': 73,
'Llama-3.3-70B-Instruct': 36,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 73,
'DeepSeek-R1-Distill-Qwen-32B': 64,
'mixtral-large-instruct-2407-lmdeploy': 43,
'Qwen2.5-72B-Instruct': 67
},
'Code_and_AI': {
'Qwen2.5-32B-Instruct': 44,
'Llama-3.1-70B-Instruct': 32,
'gemma-2-27b-it-turbomind': 34,
'DeepSeek-R1-Distill-Llama-70B': 56,
'deepseek-v2_5-1210-turbomind': 64,
'Llama-3.3-70B-Instruct': 43,
'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 62,
'DeepSeek-R1-Distill-Qwen-32B': 43,
'mixtral-large-instruct-2407-lmdeploy': 51,
'Qwen2.5-72B-Instruct': 60
}
}
class Judgerbenchv2Evaluator(BaseEvaluator):
def get_rank_dict(self, score_dict):
sorted_models = sorted(score_dict.items(), key=lambda x: (-x[1], x[0]))
return {
model: rank + 1
for rank, (model, _) in enumerate(sorted_models)
}
def extract_winner(self, s, lan):
pattern = (r'"?(胜者)"?\s*:\s*"([A-Z])"' if lan.lower() in ['zh', 'cn']
else r'"?(winner)"?\s*:\s*"([A-Z])"')
matches = re.findall(pattern, s)
return matches[-1][1] if matches else None
def score(self, predictions, references):
if len(predictions) != len(references):
return {'error': 'preds and refrs have different length'}
correct = 0
count = 0
details = []
Model_dict = {}
for prediction, reference in zip(predictions, references):
# pre-defines
ModelA = reference['ModelA']
ModelB = reference['ModelB']
if reference['category'] == 'Reason & Analysis':
r1_rank_score = R1_Score_MAP['Reason_and_analysis']
elif reference['category'] == 'Code & AI':
r1_rank_score = R1_Score_MAP['Code_and_AI']
else:
r1_rank_score = R1_Score_MAP[reference['category']]
choice = self.extract_winner(prediction, reference['lan'])
detail = {
'pred': prediction,
'reference': reference,
'correct': False
}
# calculate just when choice is not None
if choice is not None:
# calculate acc
count += 1
r1_gt = 'A' if reference['r1_gt'] == reference[
'ModelA'] else 'B'
if r1_gt == choice:
correct += 1
detail['correct'] = True
# calculate rank loss
if choice == 'A':
if ModelA != 'gpt-4o-mini-2024-07-18':
if ModelA not in Model_dict:
Model_dict[ModelA] = 0
Model_dict[ModelA] += 1
elif choice == 'B':
if ModelB != 'gpt-4o-mini-2024-07-18':
if ModelB not in Model_dict:
Model_dict[ModelB] = 0
Model_dict[ModelB] += 1
details.append(detail)
# calculate rank loss
dict1 = dict(sorted(Model_dict.items()))
dict2 = dict(sorted(r1_rank_score.items()))
rank1 = self.get_rank_dict(dict1)
rank2 = self.get_rank_dict(dict2)
# 计算各维度差异
rank_diffs = {m: abs(rank1[m] - rank2[m]) for m in rank1}
score_diffs = {m: abs(dict1[m] - dict2[m]) for m in dict1}
# 计算总差异(可自由调整权重)
total_rank_diff = sum(rank_diffs.values()) # 例如原排名总差距 = 14
total_score_diff = sum(score_diffs.values()) # 例如总分数差距 = 75
alpha = 0.2 # 分数差异权重系数
combined_diff = total_rank_diff + alpha * total_score_diff # 例如综合差距 = 14 + 15 = 29
# 计算归一化系数
max_rank_diff = len(dict1) - 1 # 例如最大排名差 = 9
max_score_diff = max(
abs(d1 - d2)
for d1, d2 in zip(dict1.values(), dict2.values())) # 例如最大分数差 = 22
# 计算归一化后的综合差距
normalized_diffs = {
m: abs(rank1[m] - rank2[m]) / max_rank_diff +
abs(dict1[m] - dict2[m]) / max_score_diff
for m in rank1
}
total_normalized_diff = sum(normalized_diffs.values()) / len(
normalized_diffs.values()) * 100
acc = 100 * correct / count
final_score = acc - total_normalized_diff
result = {
'accuracy': acc,
'rank_diff': total_rank_diff,
'score_diff': total_score_diff,
'normalized_diff': total_normalized_diff,
'final_score': final_score,
'details': details
}
return result