
# flake8: noqa: E501
# Dataset loaders for OpenCompass subjective evaluation: SubInferDataset loads
# the raw question set for the inference stage, and SubJudgeDataset pairs the
# predictions of two models for pairwise judge-based comparison.
import json
import random

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class SubInferDataset(BaseDataset):

    @staticmethod
    def load(path: str):
        dataset = DatasetDict()
        raw_data = []
        with open(path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            for problem in json_data:
                question = problem['question']
                reference_answer = problem['reference_answer']
                evaluating_guidance = problem['evaluating_guidance']
                capability = problem['capability']
                raw_data.append({
                    'question': question,
                    'judge': {
                        'question': question,
                        'reference_answer': reference_answer,
                        'evaluating_guidance': evaluating_guidance,
                        'capability': capability
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset

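# Illustrative sketch (not part of the original module): the JSON layout that
# SubInferDataset.load expects, inferred from the field accesses above. The
# file is assumed to be a list of records with exactly these keys; the values
# shown here are placeholders.
#
# [
#     {
#         "question": "...",
#         "reference_answer": "...",
#         "evaluating_guidance": "...",
#         "capability": "..."
#     }
# ]
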
@LOAD_DATASET.register_module()
class SubJudgeDataset(BaseDataset):

    @staticmethod
    def load(
        path: str,
        model1: str,
        path2: str,
        model2: str,
        mode='compare',
        random_order=True,
        random_seed=0,
    ):
        dataset = DatasetDict()
        raw_data = []
        if mode == 'compare':
            with open(path, 'r', encoding='utf-8') as f:
                json_data1 = json.load(f)
            with open(path2, 'r', encoding='utf-8') as f:
                json_data2 = json.load(f)
            random_generate = random.Random(random_seed)
            same_flag = 0
            for idx in json_data1:
                problem = json_data1[idx]
                answer1 = json_data1[idx]['prediction']
                answer2 = json_data2[idx]['prediction']
                # Identical predictions need no judgement; count and skip them.
                if answer1 == answer2:
                    same_flag += 1
                    continue
                item = {}
                item['question'] = problem['gold']['question']
                item['reference_answer'] = problem['gold']['reference_answer']
                item['evaluating_guidance'] = problem['gold'][
                    'evaluating_guidance']
                item['capability'] = problem['gold']['capability']
                if random_order:
                    # Randomly swap which model appears as answer1/answer2 to
                    # reduce position bias in the judge.
                    if random_generate.randint(0, 1) == 0:
                        item['answer1'] = answer1
                        item['model1'] = model1
                        item['answer2'] = answer2
                        item['model2'] = model2
                    else:
                        item['answer1'] = answer2
                        item['model1'] = model2
                        item['answer2'] = answer1
                        item['model2'] = model1
                else:
                    item['answer1'] = answer1
                    item['model1'] = model1
                    item['answer2'] = answer2
                    item['model2'] = model2
                raw_data.append({
                    'question': item['question'],
                    'reference_answer': item['reference_answer'],
                    'evaluating_guidance': item['evaluating_guidance'],
                    'capability': item['capability'],
                    'answer1': item['answer1'],
                    'answer2': item['answer2'],
                    'judge': {
                        'capability': item['capability'],
                        'model1': item['model1'],
                        'model2': item['model2']
                    }
                })
            if same_flag != 0:
                print(f'Among {len(json_data1)} comparisons, {same_flag} '
                      'cases are exact matches and will be skipped.')
        elif mode == 'score':
            # Score mode is not implemented yet.
            pass
        dataset = Dataset.from_list(raw_data)
        return dataset

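# Illustrative sketch (not part of the original module): how the judge-stage
# loader might be called in 'compare' mode. Both prediction files are assumed
# to be dicts keyed by example index, each value holding a 'prediction' string
# and a 'gold' record with the fields read above; the paths and model names
# below are hypothetical.
#
# dataset = SubJudgeDataset.load(
#     path='outputs/model_a_predictions.json',
#     model1='model_a',
#     path2='outputs/model_b_predictions.json',
#     model2='model_b',
#     mode='compare',
#     random_order=True,
#     random_seed=0,
# )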