# OpenCompass/opencompass/datasets/subjective/fofo.py

# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference


@LOAD_DATASET.register_module()
class FofoDataset(BaseDataset):
    """Dataset loader for FoFo problems stored as a JSON file."""

    def load(self, path: str, name: str, *args, **kwargs):
        """Build a ``Dataset`` from ``<path>/<name>.json``.

        ``path`` is first resolved through ``get_data_path`` (local mode).
        The JSON file must contain an iterable of mappings, each providing
        the keys ``instruction``, ``id``, ``domain``, ``sub_domain``,
        ``format`` and ``format_type``.

        Every record exposes the instruction as ``question`` plus a
        ``judge`` dict carrying the metadata used at judging time. The
        language tag is ``'cn'`` when ``name`` contains ``'cn'`` and
        ``'en'`` otherwise.
        """
        data_dir = get_data_path(path, local_mode=True)
        json_file = osp.join(data_dir, f'{name}.json')

        def to_record(item):
            # One dataset row per problem; the instruction is duplicated
            # inside 'judge' so it travels with the judging metadata.
            prompt = item['instruction']
            language = 'cn' if 'cn' in name else 'en'
            judge_info = {
                'lan': language,
                'id': item['id'],
                'domain': item['domain'],
                'sub_domain': item['sub_domain'],
                'format': item['format'],
                'format_type': item['format_type'],
                'question': prompt
            }
            return {'question': prompt, 'judge': judge_info}

        with open(json_file, 'r', encoding='utf-8') as fp:
            records = [to_record(item) for item in json.load(fp)]
        return Dataset.from_list(records)


def post_process_fofo(judgement: dict):
    """Extract the binary ``format_correctness`` verdict from a judge reply.

    The judge text is read from ``judgement['prediction']`` and searched for
    a fragment such as ``'format_correctness': 1`` or
    ``"format_correctness": 0`` (single or double quotes around the key).

    Args:
        judgement: Mapping holding the raw judge output under the
            ``'prediction'`` key.

    Returns:
        ``{'score': 0}`` or ``{'score': 1}`` when a valid verdict is found;
        ``None`` when the prediction is not a string, the key is absent, or
        the value is any integer other than 0 or 1.

    Raises:
        KeyError: If ``judgement`` has no ``'prediction'`` entry.
    """
    prediction = judgement['prediction']
    if not isinstance(prediction, str):
        # Nothing to parse (e.g. a missing judge reply stored as None).
        return None

    # Capture the whole integer so that values like "11" or "12" are
    # rejected below instead of being accepted or truncated to "1".
    match = re.search(r"[\"']format_correctness[\"']:\s*(\d+)", prediction)
    if match is None:
        return None

    score = int(match.group(1))
    if score not in (0, 1):
        # The verdict is binary; anything else is treated as unparseable.
        return None
    return {'score': score}


@DICT_POSTPROCESSORS.register_module('fofo')
def fofo_postprocess(output: dict, output_path: str) -> dict:
    """Average the FoFo judge scores overall, per domain and per format.

    Judged answers and their references are obtained from
    ``get_judgeanswer_and_reference`` using ``post_process_fofo`` as the
    per-sample parser. Each non-``None`` score is counted under
    ``'overall'``, under the reference's ``domain``, and, only when the
    reference's ``format_type`` equals ``'general'``, under its ``format``.

    Args:
        output: Raw judge output; passed to the helper and returned
            unchanged under the ``'details'`` key.
        output_path: Forwarded to ``get_judgeanswer_and_reference``.

    Returns:
        A dict mapping each category name to the mean of its scores, plus
        ``'details'``. When no answer could be judged, the result contains
        only ``'details'``.
    """
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_fofo)

    # Category name -> collected scores. A key only appears once a score has
    # been appended, so no list is ever empty when averaged below.
    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain']
        format_name = ref['format']
        format_type = ref['format_type']
        score = ans['score']
        if score is None:
            continue
        scores['overall'].append(score)
        scores[domain].append(score)
        if format_type == 'general':
            scores[format_name].append(score)

    results = {
        task: sum(values) / len(values)
        for task, values in scores.items()
    }
    results['details'] = output
    return results