OpenCompass/opencompass/datasets/subjective/compass_arena.py

# flake8: noqa: E501
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset
from .utils import get_judgeanswer_and_reference


@LOAD_DATASET.register_module()
class CompassArenaDataset(SubjectiveCmpDataset):
    """Subjective comparison dataset in which every sample carries a ``ref``."""

    def load(self, path: str, name: str, *args, **kwargs):
        """Load the parent dataset and attach a ``ref`` field to each sample.

        Args:
            path: Forwarded to the parent ``load``.
            name: Forwarded to the parent ``load``.
            *args: Accepted but not used by this method.
            **kwargs: Accepted but not used by this method.

        Returns:
            A ``Dataset`` built from the parent's samples, where ``ref`` is
            ``sample['others']['reference']`` when that entry exists and is
            not ``None``, and a generic fallback instruction otherwise.
        """
        # Generic fallback reference (Chinese; roughly "meeting the user's
        # needs and being reasonable is enough").
        fallback_ref = '满足用户需求，言之有理即可'

        samples = list(super().load(path, name))
        enriched = []
        for sample in samples:
            extras = sample['others']
            reference = extras['reference'] if 'reference' in extras else None
            sample['ref'] = fallback_ref if reference is None else reference
            enriched.append(sample)
        return Dataset.from_list(enriched)


def check_position_bias(judged_answers, references, banned_choice=('C',)):
    """Count judgements that look position-biased.

    Judgements are grouped by the question they belong to. The first
    judgement seen for a question is remembered; every later judgement for
    the same question that equals that first one (and is not a banned
    choice) is counted as one occurrence of position bias.

    Args:
        judged_answers: The successfully extracted judgements.
        references: Items aligned with ``judged_answers``; each must provide
            a hashable ``'question'`` entry, which is used to locate the
            judgements that belong to the same question.
        banned_choice: Choices that never count as position bias. Defaults
            to ``('C',)``.

    Returns:
        int: The number of repeated, non-banned judgements.
    """
    # Key on the question itself rather than on ``hash(question)`` so that
    # two distinct questions whose hashes collide are never merged.
    first_judgement = {}
    bias_count = 0
    for judge, ref in zip(judged_answers, references):
        question = ref['question']
        if question not in first_judgement:
            first_judgement[question] = judge
        elif judge == first_judgement[question] and judge not in banned_choice:
            # The same choice was picked again for this question, so the
            # judge followed the position rather than the content.
            bias_count += 1
    return bias_count


def post_process_compassarena(item):
    """Extract the judge's verdict letter from ``item['prediction']``.

    Returns the letter (``'A'``, ``'B'`` or ``'C'``) that follows the first
    ``选择：`` or ``Choice: `` marker in the prediction text, or ``None`` when
    no such marker is found.
    """
    verdict = re.search('(?:选择：|Choice: )([ABC])', item['prediction'])
    if verdict is None:
        return None
    return verdict.group(1)


@DICT_POSTPROCESSORS.register_module('compassarena')
def compassarena_postprocess(output: dict,
                             output_path: str,
                             summary_type='single',
                             check_pos_bias=True) -> dict:
    """Turn pairwise judge verdicts into per-capability win rates.

    The model named in ``answer1`` of the first reference is treated as
    "model 1"; the reported win rates belong to its opponent ("model 2").
    Verdict ``'A'`` is a win for whichever model is ``answer1`` of that
    sample, ``'B'`` is a win for the other one and ``'C'`` is a tie.

    Args:
        output: Raw judge output; passed to ``get_judgeanswer_and_reference``
            and returned unchanged under the ``'details'`` key.
        output_path: Passed through to ``get_judgeanswer_and_reference``.
        summary_type: With ``'half_add'`` a tie gives 0.5 to model 2; with
            any other value a tie gives 0.
        check_pos_bias: Whether to run ``check_position_bias``; when False
            the reported ``'position_bias'`` is 0.

    Returns:
        dict: One entry per capability holding model 2's win rate as a
        percentage rounded to two decimals, plus ``'position_bias'`` and
        ``'details'``. When no judgement could be extracted only the latter
        two keys are present.
    """
    # NOTE(review): assumed to return two aligned lists (verdicts and their
    # references) - confirm against the helper's implementation.
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_compassarena)

    bias_num = (check_position_bias(judged_answers, references)
                if check_pos_bias else 0)

    win_model2 = defaultdict(float)
    categories = defaultdict(float)
    # Guard against an empty extraction: indexing ``references[0]``
    # unconditionally would raise IndexError when nothing was parsed.
    model1 = references[0]['answer1'] if references else None
    tie_score = 0.5 if summary_type == 'half_add' else 0

    for prediction, reference in zip(judged_answers, references):
        capability = reference['capability']
        categories[capability] += 1

        model1_answers_first = reference['answer1'] == model1
        if prediction == 'A':
            score_2 = 0 if model1_answers_first else 1
        elif prediction == 'B':
            score_2 = 1 if model1_answers_first else 0
        elif prediction == 'C':
            score_2 = tie_score
        else:
            # Unrecognised verdict: it still counts in the denominator but
            # awards nothing, instead of silently reusing the score of the
            # previous sample (or failing on the very first one).
            score_2 = 0

        win_model2[capability] += score_2

    for capability, total in categories.items():
        win_model2[capability] = round(win_model2[capability] / total * 100,
                                       2)

    win_model2['position_bias'] = bias_num

    results = win_model2
    results['details'] = output
    return results