# flake8: noqa
import json
import os
import re
from collections import defaultdict

from .icl_base_evaluator import BaseEvaluator


class JudgeEvaluator(BaseEvaluator):
    """Accuracy evaluator for pairwise judge outputs.

    Each prediction is expected to contain a fragment such as
    '"Choice": "Model A"'; the single letter after 'Model ' is compared
    against the gold winner stored in the reference.
    """

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {'error': 'predictions and references have different lengths'}
        correct = 0
        count = 0
        details = []
        for prediction, reference in zip(predictions, references):
            # Extract the judged winner: the letter right after the last
            # occurrence of '"Choice": "Model '.
            choice = prediction.split('"Choice": "Model ')[-1][0]
            gold_winner = reference.get('winner', '')
            detail = {
                'pred': prediction,
                'answer': gold_winner,
                'correct': False
            }
            count += 1
            if choice == gold_winner:
                correct += 1
                detail['correct'] = True
            details.append(detail)
        result = {
            'accuracy': 100 * correct / count if count else 0,
            'details': details
        }
        return result
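

# A minimal usage sketch (illustrative only; the prediction string and the
# reference dict are hypothetical shapes, not fixtures from this repo):
#
#   evaluator = JudgeEvaluator()
#   preds = ['{"Choice": "Model A", "Reason": "..."}']
#   refs = [{'winner': 'A'}]
#   evaluator.score(preds, refs)  # -> {'accuracy': 100.0, 'details': [...]}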


class RMBEvaluator(BaseEvaluator):
    """Evaluator for the RMB benchmark, reporting pairwise and
    best-of-n (BoN) accuracy split by Helpfulness/Harmlessness."""

    def calculate_pair_accuracy(self, data):
        """Accuracy over individual pairwise comparisons."""
        correct = 0
        total = 0
        for item in data:
            choice = item['choice']
            gold_winner = item['gold_winner']
            if choice and gold_winner:
                total += 1
                if gold_winner == choice:
                    correct += 1

        return correct / total if total > 0 else 0

    def calculate_bon_accuracy(self, data):
        """Accuracy for the BoN metric: a bon_uid counts as correct only
        if every comparison in its group is judged correctly."""
        bon_groups = defaultdict(list)

        for item in data:
            bon_uid = item['bon_uid']
            if bon_uid:
                choice = item['choice']
                gold_winner = item['gold_winner']
                if choice and gold_winner:
                    bon_groups[bon_uid].append(gold_winner == choice)

        # A bon_uid is counted as correct only if all its comparisons match.
        correct_bons = 0
        for bon_uid, matches in bon_groups.items():
            if all(matches):
                correct_bons += 1

        return correct_bons / len(bon_groups) if bon_groups else 0
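
    # Illustrative BoN grouping (hypothetical items): given
    #   {'bon_uid': 'u1', 'choice': 'A', 'gold_winner': 'A'}
    #   {'bon_uid': 'u1', 'choice': 'B', 'gold_winner': 'A'}
    # bon_groups becomes {'u1': [True, False]}, so u1 is not counted and
    # calculate_bon_accuracy returns 0.0.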

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {'error': 'predictions and references have different lengths'}

        # Four buckets, one per (subset, goal) combination.
        bon_help_list = []
        bon_harm_list = []
        pair_help_list = []
        pair_harm_list = []

        # Split the data by subset and goal.
        for prediction, reference in zip(predictions, references):
            choice = prediction.split('"Choice": "Model ')[-1][0]
            gold_winner = reference.get('winner', '')
            subset = reference.get('subset', '')
            goal = reference.get('goal', '')

            data_item = {
                'choice': choice,
                'gold_winner': gold_winner,
                'bon_uid': reference.get('bon_uid', ''),
                'pair_uid': reference.get('pair_uid', ''),
            }

            # Route each item to the bucket matching its subset and goal.
            if subset == 'bon':
                if goal == 'Helpfulness':
                    bon_help_list.append(data_item)
                elif goal == 'Harmlessness':
                    bon_harm_list.append(data_item)
            elif subset == 'pair':
                if goal == 'Helpfulness':
                    pair_help_list.append(data_item)
                elif goal == 'Harmlessness':
                    pair_harm_list.append(data_item)

        # Accuracy for each of the four combinations.
        bon_help_acc = self.calculate_bon_accuracy(
            bon_help_list) if bon_help_list else 0
        bon_harm_acc = self.calculate_bon_accuracy(
            bon_harm_list) if bon_harm_list else 0
        pair_help_acc = self.calculate_pair_accuracy(
            pair_help_list) if pair_help_list else 0
        pair_harm_acc = self.calculate_pair_accuracy(
            pair_harm_list) if pair_harm_list else 0

        # Collect all results.
        result = {
            'bon_helpfulness_accuracy': bon_help_acc * 100,
            'bon_harmlessness_accuracy': bon_harm_acc * 100,
            'pair_helpfulness_accuracy': pair_help_acc * 100,
            'pair_harmlessness_accuracy': pair_harm_acc * 100,
            'bon_average': ((bon_help_acc + bon_harm_acc) / 2) * 100,
            'pair_average': ((pair_help_acc + pair_harm_acc) / 2) * 100,
            'total_accuracy':
            ((bon_help_acc + bon_harm_acc + pair_help_acc + pair_harm_acc) /
             4) * 100
        }

        return result
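

# A minimal usage sketch (illustrative only; field values are hypothetical):
#
#   evaluator = RMBEvaluator()
#   preds = ['{"Choice": "Model A"}']
#   refs = [{'winner': 'A', 'subset': 'pair', 'goal': 'Helpfulness',
#            'bon_uid': '', 'pair_uid': 'p1'}]
#   evaluator.score(preds, refs)['pair_helpfulness_accuracy']  # -> 100.0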


# Per-category reference scores for the judged model pool; the evaluator
# below compares the ranking induced by judge decisions against the
# ranking induced by these scores.
R1_Score_MAP = {
    'Knowledge': {
        'Qwen2.5-32B-Instruct': 55,
        'Llama-3.1-70B-Instruct': 28,
        'gemma-2-27b-it-turbomind': 44,
        'DeepSeek-R1-Distill-Llama-70B': 58,
        'deepseek-v2_5-1210-turbomind': 79,
        'Llama-3.3-70B-Instruct': 46,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 76,
        'DeepSeek-R1-Distill-Qwen-32B': 56,
        'mixtral-large-instruct-2407-lmdeploy': 72,
        'Qwen2.5-72B-Instruct': 80
    },
    'Longtext': {
        'Qwen2.5-32B-Instruct': 45,
        'Llama-3.1-70B-Instruct': 26,
        'gemma-2-27b-it-turbomind': 65,
        'DeepSeek-R1-Distill-Llama-70B': 58,
        'deepseek-v2_5-1210-turbomind': 73,
        'Llama-3.3-70B-Instruct': 37,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 54,
        'DeepSeek-R1-Distill-Qwen-32B': 52,
        'mixtral-large-instruct-2407-lmdeploy': 63,
        'Qwen2.5-72B-Instruct': 77
    },
    'Reason_and_analysis': {
        'Qwen2.5-32B-Instruct': 60,
        'Llama-3.1-70B-Instruct': 23,
        'gemma-2-27b-it-turbomind': 46,
        'DeepSeek-R1-Distill-Llama-70B': 63,
        'deepseek-v2_5-1210-turbomind': 85,
        'Llama-3.3-70B-Instruct': 45,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 68,
        'DeepSeek-R1-Distill-Qwen-32B': 66,
        'mixtral-large-instruct-2407-lmdeploy': 56,
        'Qwen2.5-72B-Instruct': 78
    },
    'safe': {
        'Qwen2.5-32B-Instruct': 72,
        'Llama-3.1-70B-Instruct': 55,
        'gemma-2-27b-it-turbomind': 72,
        'DeepSeek-R1-Distill-Llama-70B': 55,
        'deepseek-v2_5-1210-turbomind': 72,
        'Llama-3.3-70B-Instruct': 64,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 76,
        'DeepSeek-R1-Distill-Qwen-32B': 55,
        'mixtral-large-instruct-2407-lmdeploy': 69,
        'Qwen2.5-72B-Instruct': 83
    },
    'Hallucination': {
        'Qwen2.5-32B-Instruct': 78,
        'Llama-3.1-70B-Instruct': 50,
        'gemma-2-27b-it-turbomind': 65,
        'DeepSeek-R1-Distill-Llama-70B': 61,
        'deepseek-v2_5-1210-turbomind': 66,
        'Llama-3.3-70B-Instruct': 48,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 75,
        'DeepSeek-R1-Distill-Qwen-32B': 60,
        'mixtral-large-instruct-2407-lmdeploy': 76,
        'Qwen2.5-72B-Instruct': 74
    },
    'chatQA': {
        'Qwen2.5-32B-Instruct': 39,
        'Llama-3.1-70B-Instruct': 25,
        'gemma-2-27b-it-turbomind': 56,
        'DeepSeek-R1-Distill-Llama-70B': 53,
        'deepseek-v2_5-1210-turbomind': 70,
        'Llama-3.3-70B-Instruct': 34,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 69,
        'DeepSeek-R1-Distill-Qwen-32B': 48,
        'mixtral-large-instruct-2407-lmdeploy': 55,
        'Qwen2.5-72B-Instruct': 68
    },
    'IF': {
        'Qwen2.5-32B-Instruct': 34,
        'Llama-3.1-70B-Instruct': 35,
        'gemma-2-27b-it-turbomind': 38,
        'DeepSeek-R1-Distill-Llama-70B': 50,
        'deepseek-v2_5-1210-turbomind': 63,
        'Llama-3.3-70B-Instruct': 37,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 62,
        'DeepSeek-R1-Distill-Qwen-32B': 41,
        'mixtral-large-instruct-2407-lmdeploy': 47,
        'Qwen2.5-72B-Instruct': 48
    },
    'LanTask': {
        'Qwen2.5-32B-Instruct': 62,
        'Llama-3.1-70B-Instruct': 29,
        'gemma-2-27b-it-turbomind': 53,
        'DeepSeek-R1-Distill-Llama-70B': 60,
        'deepseek-v2_5-1210-turbomind': 75,
        'Llama-3.3-70B-Instruct': 46,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 69,
        'DeepSeek-R1-Distill-Qwen-32B': 71,
        'mixtral-large-instruct-2407-lmdeploy': 48,
        'Qwen2.5-72B-Instruct': 74
    },
    'Creation': {
        'Qwen2.5-32B-Instruct': 40,
        'Llama-3.1-70B-Instruct': 34,
        'gemma-2-27b-it-turbomind': 55,
        'DeepSeek-R1-Distill-Llama-70B': 66,
        'deepseek-v2_5-1210-turbomind': 73,
        'Llama-3.3-70B-Instruct': 36,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 73,
        'DeepSeek-R1-Distill-Qwen-32B': 64,
        'mixtral-large-instruct-2407-lmdeploy': 43,
        'Qwen2.5-72B-Instruct': 67
    },
    'Code_and_AI': {
        'Qwen2.5-32B-Instruct': 44,
        'Llama-3.1-70B-Instruct': 32,
        'gemma-2-27b-it-turbomind': 34,
        'DeepSeek-R1-Distill-Llama-70B': 56,
        'deepseek-v2_5-1210-turbomind': 64,
        'Llama-3.3-70B-Instruct': 43,
        'nvidia-Llama-3.1-Nemotron-70B-Instruct-HF': 62,
        'DeepSeek-R1-Distill-Qwen-32B': 43,
        'mixtral-large-instruct-2407-lmdeploy': 51,
        'Qwen2.5-72B-Instruct': 60
    }
}


class Judgerbenchv2Evaluator(BaseEvaluator):
    """Evaluator for Judgerbench v2: combines judge accuracy with a
    rank-consistency penalty against the R1 reference scores."""

    def get_rank_dict(self, score_dict):
        """Map each model to its 1-based rank, sorting by score
        (descending) and breaking ties alphabetically."""
        sorted_models = sorted(score_dict.items(), key=lambda x: (-x[1], x[0]))
        return {
            model: rank + 1
            for rank, (model, _) in enumerate(sorted_models)
        }

    def extract_winner(self, s, lan):
        """Extract the last judged winner letter from the judge output,
        matching '"胜者": "X"' for Chinese or '"winner": "X"' otherwise."""
        pattern = (r'"?(胜者)"?\s*:\s*"([A-Z])"' if lan.lower() in ['zh', 'cn']
                   else r'"?(winner)"?\s*:\s*"([A-Z])"')

        matches = re.findall(pattern, s)

        return matches[-1][1] if matches else None
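
    # Illustrative helper behaviour (hypothetical inputs):
    #   self.get_rank_dict({'m1': 3, 'm2': 7, 'm3': 3})
    #       -> {'m2': 1, 'm1': 2, 'm3': 3}
    #   self.extract_winner('{"winner": "A"}', 'en')  -> 'A'
    #   self.extract_winner('{"胜者": "B"}', 'zh')     -> 'B'
    #   self.extract_winner('no verdict', 'en')       -> None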

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {'error': 'predictions and references have different lengths'}
        correct = 0
        count = 0
        details = []
        Model_dict = {}
        for prediction, reference in zip(predictions, references):
            # Pre-defined fields of each sample.
            ModelA = reference['ModelA']
            ModelB = reference['ModelB']

            if reference['category'] == 'Reason & Analysis':
                r1_rank_score = R1_Score_MAP['Reason_and_analysis']
            elif reference['category'] == 'Code & AI':
                r1_rank_score = R1_Score_MAP['Code_and_AI']
            else:
                r1_rank_score = R1_Score_MAP[reference['category']]

            choice = self.extract_winner(prediction, reference['lan'])
            detail = {
                'pred': prediction,
                'reference': reference,
                'correct': False
            }

            # Only score samples where a winner could be extracted.
            if choice is not None:

                # Accuracy against the R1 ground truth.
                count += 1
                r1_gt = 'A' if reference['r1_gt'] == ModelA else 'B'
                if r1_gt == choice:
                    correct += 1
                    detail['correct'] = True

                # Tally wins per model for the rank-loss computation; the
                # fixed baseline model gpt-4o-mini-2024-07-18 is excluded.
                if choice == 'A':
                    if ModelA != 'gpt-4o-mini-2024-07-18':
                        Model_dict[ModelA] = Model_dict.get(ModelA, 0) + 1
                elif choice == 'B':
                    if ModelB != 'gpt-4o-mini-2024-07-18':
                        Model_dict[ModelB] = Model_dict.get(ModelB, 0) + 1

            details.append(detail)

        # Rank loss: compare observed win counts against the R1 reference
        # scores of the category seen in the loop (all samples in one run
        # are expected to share a category and cover the same model pool).
        dict1 = dict(sorted(Model_dict.items()))
        dict2 = dict(sorted(r1_rank_score.items()))

        rank1 = self.get_rank_dict(dict1)
        rank2 = self.get_rank_dict(dict2)

        # Per-model differences.
        rank_diffs = {m: abs(rank1[m] - rank2[m]) for m in rank1}
        score_diffs = {m: abs(dict1[m] - dict2[m]) for m in dict1}

        # Total differences (the weights can be tuned freely).
        total_rank_diff = sum(rank_diffs.values())  # e.g. total rank gap = 14
        total_score_diff = sum(
            score_diffs.values())  # e.g. total score gap = 75
        alpha = 0.2  # weight of the score difference
        combined_diff = total_rank_diff + alpha * total_score_diff  # e.g. 14 + 15 = 29

        # Normalization factors.
        max_rank_diff = len(dict1) - 1  # e.g. maximum rank gap = 9
        max_score_diff = max(
            abs(d1 - d2)
            for d1, d2 in zip(dict1.values(),
                              dict2.values()))  # e.g. maximum score gap = 22

        # Normalized combined difference.
        normalized_diffs = {
            m: abs(rank1[m] - rank2[m]) / max_rank_diff +
            abs(dict1[m] - dict2[m]) / max_score_diff
            for m in rank1
        }
        total_normalized_diff = sum(
            normalized_diffs.values()) / len(normalized_diffs) * 100
        acc = 100 * correct / count if count else 0
        final_score = acc - total_normalized_diff
        result = {
            'accuracy': acc,
            'rank_diff': total_rank_diff,
            'score_diff': total_score_diff,
            'normalized_diff': total_normalized_diff,
            'final_score': final_score,
            'details': details
        }
        return result
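

# Worked example of the rank-loss normalization (using the illustrative
# numbers from the inline comments, not real measurements): with
# total_rank_diff = 14, total_score_diff = 75 and alpha = 0.2,
# combined_diff = 14 + 0.2 * 75 = 29. For final_score, each model's rank
# gap is scaled by max_rank_diff (e.g. 9) and its score gap by
# max_score_diff (e.g. 22); the per-model sums are averaged, multiplied
# by 100, and subtracted from the accuracy.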