mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
# flake8: noqa: E501
import re
from collections import defaultdict

from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS

def match_general_answer(s):
    """Return the judge's choice when the response starts with 'A', 'B', 'C'
    or 'D'; otherwise return None."""
    if not s:
        return None
    temp = s[0]
    if temp in ['A', 'B', 'C', 'D']:
        return temp
    else:
        return None

def match_GPT4_answer(s):
    """Return the judge's choice from a GPT-4 style verdict such as
    '选择:A' or 'Choice: A'; otherwise return None."""
    if result := re.findall('(?:选择:|Choice: )([ABCD])', s):
        return result[0]
    else:
        return None

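# For illustration (these judge outputs are made up, not taken from the
# dataset), the two matchers behave as follows:
#   match_GPT4_answer('Choice: B, because it is more faithful.')  -> 'B'
#   match_general_answer('B. The second response is better.')     -> 'B'
#   match_general_answer('I would pick answer B.')                -> None
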
@ICL_EVALUATORS.register_module()
class Corev2Evaluator(BaseEvaluator):
    """Pairwise evaluator that parses the judge's verdicts and reports
    per-capability win/draw percentages for ``base_model`` versus
    ``compare_model``."""

    def __init__(self,
                 base_model,
                 compare_model,
                 judge_method='gpt4',
                 metric='win_rate'):
        self.base_model = base_model
        self.compare_model = compare_model
        self.metric = metric
        self.judge_method = judge_method

    def score(self, predictions, references):
        # Parse the judge's verdict (A/B/C/D) out of every raw prediction.
        if self.judge_method == 'gpt4':
            predictions = [match_GPT4_answer(s) for s in predictions]
        else:
            predictions = [match_general_answer(s) for s in predictions]
        print(
            f'Among {len(predictions)} judgements, successfully extracted {len(predictions)-predictions.count(None)} judgements.'
        )
        # Per-capability counters: `categories` counts parsed judgements,
        # `win_both` counts draws plus outright wins of the base model, and
        # `half_draw` counts outright wins of the base model only.
        win_both, half_draw, categories = defaultdict(float), defaultdict(
            float), defaultdict(float)
        for prediction, reference in zip(predictions, references):
            if prediction is not None:
                categories[reference['capability'].split('-')[0]] += 1
                winner = ''
                # 'A' prefers model1, 'B' prefers model2, 'C' prefers neither
                # outright and is counted directly towards `win_both`.
                if prediction == 'A':
                    winner = reference['model1']
                elif prediction == 'B':
                    winner = reference['model2']
                elif prediction == 'C':
                    win_both[reference['capability'].split('-')[0]] += 1
                if self.base_model == winner:
                    half_draw[reference['capability'].split('-')[0]] += 1
                    win_both[reference['capability'].split('-')[0]] += 1
        # Normalise the counts to percentages per capability; capabilities
        # where the base model never won outright are reported as 0.0.
        for capability in categories:
            if capability not in half_draw:
                win_both[capability] = 0.0
                half_draw[capability] = 0.0
            else:
                win_both[capability] = round(
                    (win_both[capability] / categories[capability]) * 100, 2)
                half_draw[capability] = round(
                    (half_draw[capability] / categories[capability]) * 100, 2)
        scores = {'win_both': win_both, 'half_draw': half_draw}
        return scores
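
# A minimal usage sketch with made-up model names, judgements and references;
# in the real pipeline the predictions come from the judge model's outputs,
# the references from the dataset, and the evaluator is built via the
# ICL_EVALUATORS registry.
if __name__ == '__main__':
    evaluator = Corev2Evaluator(base_model='model_x', compare_model='model_y')
    predictions = ['Choice: A', 'Choice: C', 'Choice: B']
    references = [
        {'capability': 'reasoning-math', 'model1': 'model_x', 'model2': 'model_y'},
        {'capability': 'reasoning-math', 'model1': 'model_x', 'model2': 'model_y'},
        {'capability': 'language-translation', 'model1': 'model_y', 'model2': 'model_x'},
    ]
    # Prints per-capability win_both/half_draw percentages, keyed by the
    # part of 'capability' before the first '-'.
    print(evaluator.score(predictions, references))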