OpenCompass/opencompass/datasets/subjective/fofo.py
bittersweet1999 fa54aa62f6
[Feature] Add Judgerbench and reorg subeval (#1593)
* fix pip version

* fix pip version

* update (#1522)

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>

* [Feature] Update Models (#1518)

* Update Models

* Update

* Update humanevalx

* Update

* Update

* [Feature] Dataset prompts update for ARC, BoolQ, Race (#1527)

add judgerbench and reorg sub

add judgerbench and reorg subeval

add judgerbench and reorg subeval

* add judgerbench and reorg subeval

* add judgerbench and reorg subeval

* add judgerbench and reorg subeval

* add judgerbench and reorg subeval

---------

Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
Co-authored-by: Linchen Xiao <xxllcc1993@gmail.com>
2024-10-15 16:36:05 +08:00

85 lines
2.6 KiB
Python

# flake8: noqa
import json
import os.path as osp
import re
from collections import defaultdict
from datasets import Dataset
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path
from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference
@LOAD_DATASET.register_module()
class FofoDataset(BaseDataset):
def load(self, path: str, name: str, *args, **kwargs):
path = get_data_path(path, local_mode=True)
filename = osp.join(path, f'{name}.json')
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:
json_data = json.load(f)
for problem in json_data:
question = problem['instruction']
lan = 'cn' if 'cn' in name else 'en'
raw_data.append({
'question': question,
'judge': {
'lan': lan,
'id': problem['id'],
'domain': problem['domain'],
'sub_domain': problem['sub_domain'],
'format': problem['format'],
'format_type': problem['format_type'],
'question': question
}
})
dataset = Dataset.from_list(raw_data)
return dataset
def post_process_fofo(judgement: dict):
"""Input a string like below:
xxx[[5]]xxx, and extract the score
"""
match = re.search(r"[\"']format_correctness[\"']:\s*([0-1]+)",
judgement['prediction'])
if match:
score = int(match.group(1))
else:
return None
return {'score': score}
@DICT_POSTPROCESSORS.register_module('fofo')
def fofo_postprocess(output: dict, output_path: str) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process_fofo)
if len(judged_answers) == 0:
scores = None
scores = defaultdict(list)
for ans, ref in zip(judged_answers, references):
domain = ref['domain']
format_name = ref['format']
format_type = ref['format_type']
score = ans['score']
if score is not None:
scores['overall'].append(score)
scores[domain].append(score)
if format_type == 'general':
scores[format_name].append(score)
single_model_scores = {
task: sum(score) / len(score)
for task, score in scores.items()
}
results = single_model_scores
results['details'] = output
return results