OpenCompass/opencompass/datasets/corev2.py
bittersweet1999 1c95790fdd
New subjective judgement (#660)
* TabMWP

* TabMWP

* fixed

* fixed

* fixed

* done

* done

* done

* add new subjective judgement

* add new subjective judgement

* add new subjective judgement

* add new subjective judgement

* add new subjective judgement

* modified to a more general way

* modified to a more general way

* final

* final

* add summarizer

* add new summarize

* fixed

* fixed

* fixed

---------

Co-authored-by: caomaosong <caomaosong@pjlab.org.cn>
2023-12-06 13:28:33 +08:00

71 lines
2.5 KiB
Python

# flake8: noqa: E501
import re
from collections import defaultdict
from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
def match_general_answer(s):
    """Extract a judgement letter from a free-form judge response.

    Expects the choice to be the very first character of *s*.

    Args:
        s (str): Raw judge output.

    Returns:
        str | None: 'A', 'B', 'C' or 'D' if the first character is one of
        them, otherwise ``None`` (including for an empty string, which
        previously raised ``IndexError``).
    """
    # Guard against empty predictions: s[0] on '' raises IndexError.
    if not s:
        return None
    temp = s[0]
    if temp in ['A', 'B', 'C', 'D']:
        return temp
    else:
        return None
def match_GPT4_answer(s):
    """Extract the judge's choice letter from a GPT-4 style judgement.

    Looks for the first occurrence of '选择:' or 'Choice: ' followed by a
    single letter A-D and returns that letter, or ``None`` when no such
    marker is present.
    """
    found = re.search('(?:选择:|Choice: )([ABCD])', s)
    return found.group(1) if found else None
@ICL_EVALUATORS.register_module()
class Corev2Evaluator(BaseEvaluator):
    """Pairwise subjective evaluator for Corev2-style judgements.

    Each judgement compares two model answers; the judge emits 'A'
    (model1 wins), 'B' (model2 wins) or 'C' (draw / both good). Per
    capability category (the part of ``reference['capability']`` before
    the first '-'), this evaluator reports two percentages for
    ``base_model``:

    * ``win_both``: outright wins plus draws.
    * ``half_draw``: outright wins only.
    """

    def __init__(self,
                 base_model,
                 compare_model,
                 judge_method='gpt4',
                 metric='win_rate'):
        # base_model: name of the model whose rates are reported.
        # compare_model: name of the opponent model (stored for reference).
        # judge_method: 'gpt4' uses the '选择:/Choice:' extractor; any
        #   other value falls back to first-character matching.
        # metric: label for the reported metric (stored for reference).
        self.base_model = base_model
        self.compare_model = compare_model
        self.metric = metric
        self.judge_method = judge_method

    def score(self, predictions, references):
        """Aggregate judgements into per-capability win percentages.

        Args:
            predictions (list[str]): Raw judge outputs.
            references (list[dict]): Per-sample metadata with keys
                'capability', 'model1' and 'model2'.

        Returns:
            dict: ``{'win_both': {...}, 'half_draw': {...}}`` mapping each
            capability to a percentage rounded to 2 decimals.
        """
        if self.judge_method == 'gpt4':
            predictions = [match_GPT4_answer(s) for s in predictions]
        else:
            predictions = [match_general_answer(s) for s in predictions]
        print(
            f'Among {len(predictions)} judgements, successfully extracted {len(predictions)-predictions.count(None)} judgements.'
        )
        win_both, half_draw, categories = defaultdict(float), defaultdict(
            float), defaultdict(float)
        for prediction, reference in zip(predictions, references):
            # Unparseable judgements are excluded from every tally.
            if prediction is None:
                continue
            category = reference['capability'].split('-')[0]
            categories[category] += 1
            winner = ''
            if prediction == 'A':
                winner = reference['model1']
            elif prediction == 'B':
                winner = reference['model2']
            elif prediction == 'C':
                # Draw counts toward win_both but not half_draw.
                win_both[category] += 1
            if self.base_model == winner:
                half_draw[category] += 1
                win_both[category] += 1
        for capability in categories:
            # BUG FIX: the old code zeroed win_both[capability] whenever the
            # capability had no outright base-model win, discarding genuine
            # draw ('C') counts. The defaultdicts already yield 0.0 for
            # absent keys, so unconditional normalization is both correct
            # and simpler, and still emits every capability in both dicts.
            win_both[capability] = round(
                (win_both[capability] / categories[capability]) * 100, 2)
            half_draw[capability] = round(
                (half_draw[capability] / categories[capability]) * 100, 2)
        return {'win_both': win_both, 'half_draw': half_draw}