Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

71 lines · 2.5 KiB · Python
# flake8: noqa: E501
import re
from collections import defaultdict

from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS


def match_general_answer(s):
    """Extract a verdict from a judge response whose first character is A/B/C/D."""
    temp = s[0]
    if temp in ['A', 'B', 'C', 'D']:
        return temp
    else:
        return None


def match_GPT4_answer(s):
    """Extract a verdict from a GPT-4 judge response containing a label of the
    form '选择: X' (Chinese, 'Choice') or 'Choice: X', where X is A/B/C/D."""
    if result := re.findall('(?:选择:|Choice: )([ABCD])', s):
        return result[0]
    else:
        return None


@ICL_EVALUATORS.register_module()
class Corev2Evaluator(BaseEvaluator):

    def __init__(self,
                 base_model,
                 compare_model,
                 judge_method='gpt4',
                 metric='win_rate'):
        self.base_model = base_model
        self.compare_model = compare_model
        self.metric = metric
        self.judge_method = judge_method

    def score(self, predictions, references):
        # Parse the raw judge outputs into A/B/C/D verdicts.
        if self.judge_method == 'gpt4':
            predictions = [match_GPT4_answer(s) for s in predictions]
        else:
            predictions = [match_general_answer(s) for s in predictions]
        print(
            f'Among {len(predictions)} judgements, successfully extracted {len(predictions)-predictions.count(None)} judgements.'
        )
        # Tally verdicts per top-level capability, i.e. the part of
        # reference['capability'] before the first '-'.
        win_both, half_draw, categories = (defaultdict(float),
                                           defaultdict(float),
                                           defaultdict(float))
        for prediction, reference in zip(predictions, references):
            if prediction is None:
                continue
            capability = reference['capability'].split('-')[0]
            categories[capability] += 1
            winner = ''
            if prediction == 'A':
                winner = reference['model1']
            elif prediction == 'B':
                winner = reference['model2']
            elif prediction == 'C':
                # 'C' means both answers are judged acceptable.
                win_both[capability] += 1
            if self.base_model == winner:
                # An outright win for the base model counts towards both metrics.
                half_draw[capability] += 1
                win_both[capability] += 1
        # Convert raw counts into percentages per capability. Percentages are
        # only computed for capabilities where the base model won at least once.
        for capability in categories:
            if capability not in half_draw:
                win_both[capability] = 0.0
                half_draw[capability] = 0.0
            else:
                win_both[capability] = round(
                    (win_both[capability] / categories[capability]) * 100, 2)
                half_draw[capability] = round(
                    (half_draw[capability] / categories[capability]) * 100, 2)
        scores = {'win_both': win_both, 'half_draw': half_draw}
        return scores
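# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original file): the model names,
# judgement strings, and reference records below are made-up sample data that
# illustrate the input shapes `score` expects. `predictions` holds raw judge
# outputs; each reference carries the two compared models and a capability tag.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    evaluator = Corev2Evaluator(base_model='model_a',
                                compare_model='model_b',
                                judge_method='gpt4')
    predictions = ['Choice: A\nsome rationale', 'Choice: C', 'no verdict']
    references = [
        {'capability': 'reasoning-math', 'model1': 'model_a', 'model2': 'model_b'},
        {'capability': 'reasoning-logic', 'model1': 'model_b', 'model2': 'model_a'},
        {'capability': 'language-summary', 'model1': 'model_a', 'model2': 'model_b'},
    ]
    # Expected: {'win_both': {'reasoning': 100.0}, 'half_draw': {'reasoning': 50.0}}
    # (one win plus one 'both good' out of two parsed 'reasoning' judgements;
    # the third judgement yields no verdict and is skipped).
    print(evaluator.score(predictions, references))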