mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
# flake8: noqa: E501
import re
from collections import defaultdict

from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS

def match_general_answer(s):
    """Return the judge's choice when the response starts with 'A', 'B', 'C'
    or 'D'; otherwise return None."""
    if not s:
        return None
    temp = s[0]
    if temp in ['A', 'B', 'C', 'D']:
        return temp
    else:
        return None

def match_GPT4_answer(s):
    """Return the judge's choice from a GPT-4 style verdict such as
    '选择:A' or 'Choice: A'; otherwise return None."""
    if result := re.findall('(?:选择:|Choice: )([ABCD])', s):
        return result[0]
    else:
        return None

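# For illustration (these judge outputs are made up, not taken from the
# dataset), the two matchers behave as follows:
#   match_GPT4_answer('Choice: B, because it is more faithful.')  -> 'B'
#   match_general_answer('B. The second response is better.')     -> 'B'
#   match_general_answer('I would pick answer B.')                -> None
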
@ICL_EVALUATORS.register_module()
class Corev2Evaluator(BaseEvaluator):
    """Pairwise evaluator that parses the judge's verdicts and reports
    per-capability win/draw percentages for ``base_model`` versus
    ``compare_model``."""

    def __init__(self,
                 base_model,
                 compare_model,
                 judge_method='gpt4',
                 metric='win_rate'):
        self.base_model = base_model
        self.compare_model = compare_model
        self.metric = metric
        self.judge_method = judge_method

    def score(self, predictions, references):
        # Parse the judge's verdict (A/B/C/D) out of every raw prediction.
        if self.judge_method == 'gpt4':
            predictions = [match_GPT4_answer(s) for s in predictions]
        else:
            predictions = [match_general_answer(s) for s in predictions]
        print(
            f'Among {len(predictions)} judgements, successfully extracted {len(predictions)-predictions.count(None)} judgements.'
        )
        # Per-capability counters: `categories` counts parsed judgements,
        # `win_both` counts draws plus outright wins of the base model, and
        # `half_draw` counts outright wins of the base model only.
        win_both, half_draw, categories = defaultdict(float), defaultdict(
            float), defaultdict(float)
        for prediction, reference in zip(predictions, references):
            if prediction is not None:
                categories[reference['capability'].split('-')[0]] += 1
                winner = ''
                # 'A' prefers model1, 'B' prefers model2, 'C' prefers neither
                # outright and is counted directly towards `win_both`.
                if prediction == 'A':
                    winner = reference['model1']
                elif prediction == 'B':
                    winner = reference['model2']
                elif prediction == 'C':
                    win_both[reference['capability'].split('-')[0]] += 1
                if self.base_model == winner:
                    half_draw[reference['capability'].split('-')[0]] += 1
                    win_both[reference['capability'].split('-')[0]] += 1
        # Normalise the counts to percentages per capability; capabilities
        # where the base model never won outright are reported as 0.0.
        for capability in categories:
            if capability not in half_draw:
                win_both[capability] = 0.0
                half_draw[capability] = 0.0
            else:
                win_both[capability] = round(
                    (win_both[capability] / categories[capability]) * 100, 2)
                half_draw[capability] = round(
                    (half_draw[capability] / categories[capability]) * 100, 2)
        scores = {'win_both': win_both, 'half_draw': half_draw}
        return scores
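
# A minimal usage sketch with made-up model names, judgements and references;
# in the real pipeline the predictions come from the judge model's outputs,
# the references from the dataset, and the evaluator is built via the
# ICL_EVALUATORS registry.
if __name__ == '__main__':
    evaluator = Corev2Evaluator(base_model='model_x', compare_model='model_y')
    predictions = ['Choice: A', 'Choice: C', 'Choice: B']
    references = [
        {'capability': 'reasoning-math', 'model1': 'model_x', 'model2': 'model_y'},
        {'capability': 'reasoning-math', 'model1': 'model_x', 'model2': 'model_y'},
        {'capability': 'language-translation', 'model1': 'model_y', 'model2': 'model_x'},
    ]
    # Prints per-capability win_both/half_draw percentages, keyed by the
    # part of 'capability' before the first '-'.
    print(evaluator.score(predictions, references))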