OpenCompass/opencompass/datasets/corev2.py
bittersweet1999 1c95790fdd
New subjective judgement (#660)
* TabMWP

* TabMWP

* fixed

* fixed

* fixed

* done

* done

* done

* add new subjective judgement

* add new subjective judgement

* add new subjective judgement

* add new subjective judgement

* add new subjective judgement

* modified to a more general way

* modified to a more general way

* final

* final

* add summarizer

* add new summarize

* fixed

* fixed

* fixed

---------

Co-authored-by: caomaosong <caomaosong@pjlab.org.cn>
2023-12-06 13:28:33 +08:00

71 lines
2.5 KiB
Python

# flake8: noqa: E501
import re
from collections import defaultdict
from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
def match_general_answer(s):
    """Extract a judgement letter from a free-form judge response.

    Expects the choice to be the very first character of *s*.

    Args:
        s (str): Raw judge output.

    Returns:
        str | None: 'A', 'B', 'C' or 'D' if the first character is one of
        them, otherwise ``None`` (including for an empty string, which
        previously raised ``IndexError``).
    """
    # Guard against empty predictions: s[0] on '' raises IndexError.
    if not s:
        return None
    temp = s[0]
    if temp in ['A', 'B', 'C', 'D']:
        return temp
    else:
        return None
def match_GPT4_answer(s):
    """Extract the judge's choice letter from a GPT-4 style judgement.

    Looks for the first occurrence of '选择:' or 'Choice: ' followed by a
    single letter A-D and returns that letter, or ``None`` when no such
    marker is present.
    """
    found = re.search('(?:选择:|Choice: )([ABCD])', s)
    return found.group(1) if found else None
@ICL_EVALUATORS.register_module()
class Corev2Evaluator(BaseEvaluator):
    """Pairwise subjective evaluator for Corev2-style judgements.

    Each judgement compares two model answers; the judge emits 'A'
    (model1 wins), 'B' (model2 wins) or 'C' (draw / both good). Per
    capability category (the part of ``reference['capability']`` before
    the first '-'), this evaluator reports two percentages for
    ``base_model``:

    * ``win_both``: outright wins plus draws.
    * ``half_draw``: outright wins only.
    """

    def __init__(self,
                 base_model,
                 compare_model,
                 judge_method='gpt4',
                 metric='win_rate'):
        # base_model: name of the model whose rates are reported.
        # compare_model: name of the opponent model (stored for reference).
        # judge_method: 'gpt4' uses the '选择:/Choice:' extractor; any
        #   other value falls back to first-character matching.
        # metric: label for the reported metric (stored for reference).
        self.base_model = base_model
        self.compare_model = compare_model
        self.metric = metric
        self.judge_method = judge_method

    def score(self, predictions, references):
        """Aggregate judgements into per-capability win percentages.

        Args:
            predictions (list[str]): Raw judge outputs.
            references (list[dict]): Per-sample metadata with keys
                'capability', 'model1' and 'model2'.

        Returns:
            dict: ``{'win_both': {...}, 'half_draw': {...}}`` mapping each
            capability to a percentage rounded to 2 decimals.
        """
        if self.judge_method == 'gpt4':
            predictions = [match_GPT4_answer(s) for s in predictions]
        else:
            predictions = [match_general_answer(s) for s in predictions]
        print(
            f'Among {len(predictions)} judgements, successfully extracted {len(predictions)-predictions.count(None)} judgements.'
        )
        win_both, half_draw, categories = defaultdict(float), defaultdict(
            float), defaultdict(float)
        for prediction, reference in zip(predictions, references):
            # Unparseable judgements are excluded from every tally.
            if prediction is None:
                continue
            category = reference['capability'].split('-')[0]
            categories[category] += 1
            winner = ''
            if prediction == 'A':
                winner = reference['model1']
            elif prediction == 'B':
                winner = reference['model2']
            elif prediction == 'C':
                # Draw counts toward win_both but not half_draw.
                win_both[category] += 1
            if self.base_model == winner:
                half_draw[category] += 1
                win_both[category] += 1
        for capability in categories:
            # BUG FIX: the old code zeroed win_both[capability] whenever the
            # capability had no outright base-model win, discarding genuine
            # draw ('C') counts. The defaultdicts already yield 0.0 for
            # absent keys, so unconditional normalization is both correct
            # and simpler, and still emits every capability in both dicts.
            win_both[capability] = round(
                (win_both[capability] / categories[capability]) * 100, 2)
            half_draw[capability] = round(
                (half_draw[capability] / categories[capability]) * 100, 2)
        return {'win_both': win_both, 'half_draw': half_draw}