mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

* fix pip version * fix pip version * update (#1522) Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> * [Feature] Update Models (#1518) * Update Models * Update * Update humanevalx * Update * Update * [Feature] Dataset prompts update for ARC, BoolQ, Race (#1527) add judgerbench and reorg sub add judgerbench and reorg subeval add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval * add judgerbench and reorg subeval --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn> Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com> Co-authored-by: Linchen Xiao <xxllcc1993@gmail.com>
85 lines
2.6 KiB
Python
85 lines
2.6 KiB
Python
# flake8: noqa
|
|
import json
|
|
import os.path as osp
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
from datasets import Dataset
|
|
|
|
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
|
|
from opencompass.utils import get_data_path
|
|
|
|
from ..base import BaseDataset
|
|
from .utils import get_judgeanswer_and_reference
|
|
|
|
|
|
@LOAD_DATASET.register_module()
class FofoDataset(BaseDataset):
    """Dataset loader for FoFo instruction files stored as ``<name>.json``."""

    def load(self, path: str, name: str, *args, **kwargs):
        """Build a ``Dataset`` from ``<path>/<name>.json``.

        Every entry of the JSON file yields one row holding the instruction
        text under ``question`` and a ``judge`` dict carrying the entry's
        id, domain, sub-domain, format, format type, the question again and
        a language tag.

        Args:
            path: Dataset location, resolved through ``get_data_path``.
            name: Base name of the JSON file (without extension). A name
                containing ``'cn'`` tags every row as ``'cn'``; any other
                name tags rows as ``'en'``.

        Returns:
            The ``Dataset`` created from the collected rows.
        """
        data_root = get_data_path(path, local_mode=True)
        json_file = osp.join(data_root, f'{name}.json')

        with open(json_file, 'r', encoding='utf-8') as fp:
            problems = json.load(fp)

        # The language tag depends only on the dataset name.
        language = 'cn' if 'cn' in name else 'en'

        rows = [
            {
                'question': item['instruction'],
                'judge': {
                    'lan': language,
                    'id': item['id'],
                    'domain': item['domain'],
                    'sub_domain': item['sub_domain'],
                    'format': item['format'],
                    'format_type': item['format_type'],
                    'question': item['instruction'],
                },
            }
            for item in problems
        ]
        return Dataset.from_list(rows)
|
|
|
|
|
|
def post_process_fofo(judgement: dict):
    """Extract the binary ``format_correctness`` verdict from a judge reply.

    The judge's text is expected to contain a fragment such as
    ``"format_correctness": 1`` (single or double quotes around the key,
    optional whitespace after the colon).

    Args:
        judgement: Mapping whose ``'prediction'`` entry is the judge's raw
            text output.

    Returns:
        ``{'score': 0}`` or ``{'score': 1}`` when a verdict is found,
        otherwise ``None``.
    """
    # Accept exactly one binary digit. The negative lookahead rejects
    # multi-digit values such as "11" or "10", which would otherwise be
    # turned into out-of-range scores and distort the averages.
    match = re.search(r"[\"']format_correctness[\"']:\s*([01])(?!\d)",
                      judgement['prediction'])
    if match is None:
        return None
    return {'score': int(match.group(1))}
|
|
|
|
|
|
@DICT_POSTPROCESSORS.register_module('fofo')
def fofo_postprocess(output: dict, output_path: str) -> dict:
    """Aggregate FoFo judge scores into per-category averages.

    Judged answers and their references are obtained from
    ``get_judgeanswer_and_reference`` using ``post_process_fofo`` as the
    per-item parser. Every non-``None`` score is counted towards
    ``'overall'`` and towards its reference's ``domain``; when the
    reference's ``format_type`` is ``'general'`` it is also counted towards
    the reference's ``format`` name.

    Args:
        output: Raw judge output; also returned unchanged under
            ``'details'``.
        output_path: Forwarded to ``get_judgeanswer_and_reference``.

    Returns:
        A dict mapping each category to the mean of its scores, plus a
        ``'details'`` entry holding ``output``. When there are no judged
        answers the dict contains only ``'details'``.
    """
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_fofo)

    # No special case is needed for an empty result: the loop simply does
    # not run and no averages are produced.
    scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        domain = ref['domain']
        format_name = ref['format']
        format_type = ref['format_type']
        score = ans['score']
        if score is None:
            continue
        scores['overall'].append(score)
        scores[domain].append(score)
        if format_type == 'general':
            scores[format_name].append(score)

    # Every bucket has at least one score, so the division is safe.
    results = {
        task: sum(values) / len(values)
        for task, values in scores.items()
    }
    results['details'] = output
    return results
|