OpenCompass/opencompass/datasets/subjective/fofo.py

# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict

from datasets import Dataset

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference


@LOAD_DATASET.register_module()
class FofoDataset(BaseDataset):
    """Loader for the FOFO subjective-evaluation problem files."""

    def load(self, path: str, name: str, *args, **kwargs):
        """Read ``<path>/<name>.json`` and build a ``Dataset`` from it.

        Args:
            path (str): Dataset location, passed through ``get_data_path``
                (with ``local_mode=True``) before use.
            name (str): Base name of the JSON file (without extension). When
                it contains the substring ``'cn'`` the samples are tagged
                with language ``'cn'``, otherwise ``'en'``.

        Returns:
            Dataset: One row per problem with a ``question`` column and a
            ``judge`` column carrying the problem metadata.
        """
        data_root = get_data_path(path, local_mode=True)
        json_file = osp.join(data_root, f'{name}.json')
        with open(json_file, 'r', encoding='utf-8') as fp:
            problems = json.load(fp)

        # The language tag depends only on the file name, not on the sample.
        language = 'cn' if 'cn' in name else 'en'
        records = [{
            'question': item['instruction'],
            'judge': {
                'lan': language,
                'id': item['id'],
                'domain': item['domain'],
                'sub_domain': item['sub_domain'],
                'format': item['format'],
                'format_type': item['format_type'],
                'question': item['instruction']
            }
        } for item in problems]
        return Dataset.from_list(records)


def post_process_fofo(judgement: dict):
    """Extract the binary ``format_correctness`` verdict from a judge reply.

    The judge's text is read from ``judgement['prediction']`` and searched
    for a fragment such as ``"format_correctness": 1`` (the key may be
    wrapped in single or double quotes).

    Args:
        judgement (dict): Record holding the judge model's raw output under
            the ``'prediction'`` key.

    Returns:
        dict | None: ``{'score': 0}`` or ``{'score': 1}`` when a verdict is
        found, otherwise ``None``.
    """
    # Accept exactly one binary digit. A value such as ``11`` or ``10`` is
    # not a valid 0/1 verdict, so it is treated as unparsable instead of
    # being turned into an out-of-range score that would skew the averages.
    match = re.search(r"[\"']format_correctness[\"']:\s*([01])(?!\d)",
                      judgement['prediction'])
    if match is None:
        return None

    return {'score': int(match.group(1))}


@DICT_POSTPROCESSORS.register_module('fofo')
def fofo_postprocess(output: dict, output_path: str) -> dict:
    """Aggregate FOFO judge verdicts into mean scores per category.

    Judged answers and their references are obtained from
    ``get_judgeanswer_and_reference`` using ``post_process_fofo`` as the
    parser. Each non-``None`` score is counted under ``'overall'``, under
    the reference's ``domain``, and, when the reference's ``format_type``
    is ``'general'``, under its ``format`` name as well.

    Args:
        output (dict): Raw judge output; forwarded to the helper and
            returned unchanged under the ``'details'`` key.
        output_path (str): Forwarded to ``get_judgeanswer_and_reference``.

    Returns:
        dict: Mapping of category name to mean score, plus ``'details'``.
        When no answer could be judged, only ``'details'`` is present.
    """
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_fofo)

    # A bucket only comes into existence when a score is appended to it, so
    # every bucket is non-empty and the mean below cannot divide by zero.
    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain']
        format_name = ref['format']
        format_type = ref['format_type']
        score = ans['score']
        if score is None:
            continue
        scores['overall'].append(score)
        scores[domain].append(score)
        if format_type == 'general':
            scores[format_name].append(score)

    results = {
        task: sum(values) / len(values)
        for task, values in scores.items()
    }
    results['details'] = output
    return results
[Feature] add dataset Fofo (#1224) * add fofo dataset * add dataset fofo 2024-06-06 11:40:48 +08:00			`# flake8: noqa`
			`import json`
			`import os.path as osp`
[Feature] Add Judgerbench and reorg subeval (#1593) * fix pip version * fix pip version * update (#1522) Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> * [Feature] Update Models (#1518) * Update Models * Update * Update humanevalx * Update * Update * [Feature] Dataset prompts update for ARC, BoolQ, Race (#1527) add judgerbench and reorg sub add judgerbench and reorg subeval add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com> Co-authored-by: Linchen Xiao <xxllcc1993@gmail.com> 2024-10-15 16:36:05 +08:00			`import re`
			`from collections import defaultdict`
[Feature] add dataset Fofo (#1224) * add fofo dataset * add dataset fofo 2024-06-06 11:40:48 +08:00
			`from datasets import Dataset`

[Feature] Add Judgerbench and reorg subeval (#1593) * fix pip version * fix pip version * update (#1522) Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> * [Feature] Update Models (#1518) * Update Models * Update * Update humanevalx * Update * Update * [Feature] Dataset prompts update for ARC, BoolQ, Race (#1527) add judgerbench and reorg sub add judgerbench and reorg subeval add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com> Co-authored-by: Linchen Xiao <xxllcc1993@gmail.com> 2024-10-15 16:36:05 +08:00			`from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET`
[Update] Support auto-download of FOFO/MT-Bench-101 (#1423) * [Update] Support auto-download of FOFO/MT-Bench-101 * Update wildbench 2024-08-16 11:57:41 +08:00			`from opencompass.utils import get_data_path`
[Feature] add dataset Fofo (#1224) * add fofo dataset * add dataset fofo 2024-06-06 11:40:48 +08:00
			`from ..base import BaseDataset`
[Feature] Add Judgerbench and reorg subeval (#1593) * fix pip version * fix pip version * update (#1522) Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> * [Feature] Update Models (#1518) * Update Models * Update * Update humanevalx * Update * Update * [Feature] Dataset prompts update for ARC, BoolQ, Race (#1527) add judgerbench and reorg sub add judgerbench and reorg subeval add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com> Co-authored-by: Linchen Xiao <xxllcc1993@gmail.com> 2024-10-15 16:36:05 +08:00			`from .utils import get_judgeanswer_and_reference`
[Feature] add dataset Fofo (#1224) * add fofo dataset * add dataset fofo 2024-06-06 11:40:48 +08:00

			`@LOAD_DATASET.register_module()`
			`class FofoDataset(BaseDataset):`

[Refactor] Reorganize subjective eval (#1284) * fix pip version * fix pip version * reorganize subjective eval * reorg sub * reorg subeval * reorg subeval * update subjective doc * reorg subeval * reorg subeval 2024-07-05 22:11:37 +08:00			`def load(self, path: str, name: str, *args, **kwargs):`
[Update] Support auto-download of FOFO/MT-Bench-101 (#1423) * [Update] Support auto-download of FOFO/MT-Bench-101 * Update wildbench 2024-08-16 11:57:41 +08:00			`path = get_data_path(path, local_mode=True)`
[Feature] add dataset Fofo (#1224) * add fofo dataset * add dataset fofo 2024-06-06 11:40:48 +08:00			`filename = osp.join(path, f'{name}.json')`
			`raw_data = []`
			`with open(filename, 'r', encoding='utf-8') as f:`
			`json_data = json.load(f)`
			`for problem in json_data:`
			`question = problem['instruction']`
			`lan = 'cn' if 'cn' in name else 'en'`
			`raw_data.append({`
			`'question': question,`
			`'judge': {`
			`'lan': lan,`
			`'id': problem['id'],`
			`'domain': problem['domain'],`
			`'sub_domain': problem['sub_domain'],`
			`'format': problem['format'],`
			`'format_type': problem['format_type'],`
			`'question': question`
			`}`
			`})`
			`dataset = Dataset.from_list(raw_data)`
			`return dataset`
[Feature] Add Judgerbench and reorg subeval (#1593) * fix pip version * fix pip version * update (#1522) Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> * [Feature] Update Models (#1518) * Update Models * Update * Update humanevalx * Update * Update * [Feature] Dataset prompts update for ARC, BoolQ, Race (#1527) add judgerbench and reorg sub add judgerbench and reorg subeval add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com> Co-authored-by: Linchen Xiao <xxllcc1993@gmail.com> 2024-10-15 16:36:05 +08:00

			`def post_process_fofo(judgement: dict):`
			`"""Input a string like below:`

			`xxx[[5]]xxx, and extract the score`
			`"""`
			`match = re.search(r"[\"']format_correctness[\"']:\s*([0-1]+)",`
			`judgement['prediction'])`
			`if match:`
			`score = int(match.group(1))`
			`else:`
			`return None`

			`return {'score': score}`


			`@DICT_POSTPROCESSORS.register_module('fofo')`
			`def fofo_postprocess(output: dict, output_path: str) -> dict:`
			`judged_answers, references = get_judgeanswer_and_reference(`
			`output, output_path, post_process_fofo)`

			`if len(judged_answers) == 0:`
			`scores = None`

			`scores = defaultdict(list)`
			`for ans, ref in zip(judged_answers, references):`
			`domain = ref['domain']`
			`format_name = ref['format']`
			`format_type = ref['format_type']`
			`score = ans['score']`
			`if score is not None:`
			`scores['overall'].append(score)`
			`scores[domain].append(score)`
			`if format_type == 'general':`
			`scores[format_name].append(score)`
			`single_model_scores = {`
			`task: sum(score) / len(score)`
			`for task, score in scores.items()`
			`}`
			`results = single_model_scores`
			`results['details'] = output`
			`return results`