OpenCompass/opencompass/datasets/internsandbox.py
Dongsheng Zhu ba0e32292c
[Feature] Support InternSandbox (#2049)
* internsandbox init

* internsandbox

* dataset_index

* dataset_index_add
2025-05-07 16:42:09 +08:00

79 lines
2.5 KiB
Python

import importlib
import json
import os.path as osp
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path
from .base import BaseDataset
@LOAD_DATASET.register_module()
class InternSandboxDataset(BaseDataset):
    """Load one InternSandbox JSONL split as a ``datasets.Dataset``."""

    @staticmethod
    def load(path: str, sandbox: str, local_mode: bool = False):
        """Read ``<path>/<sandbox>.jsonl`` and return a ``Dataset``.

        Args:
            path: Dataset root directory; resolved via ``get_data_path``.
            sandbox: Sandbox name, i.e. the stem of the ``.jsonl`` file.
            local_mode: Forwarded to ``get_data_path`` to force local
                path resolution.

        Returns:
            ``datasets.Dataset`` whose ``ground_truth`` column is a JSON
            string (re-serialized so the column has a uniform type).
        """
        path = get_data_path(path, local_mode=local_mode)
        file_path = osp.join(path, f'{sandbox}.jsonl')
        data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Skip blank lines (e.g. a trailing newline) instead of
                # crashing inside json.loads.
                if not line.strip():
                    continue
                origin_data = json.loads(line)
                # Re-serialize ground_truth so heterogeneous structures
                # all become strings for Dataset.from_list.
                origin_data['ground_truth'] = json.dumps(
                    origin_data['ground_truth'])
                data.append(origin_data)
        return Dataset.from_list(data)
@ICL_EVALUATORS.register_module()
class InternSandboxEvaluator(BaseEvaluator):
    """Score predictions with the external ``intern_sandbox`` package.

    Each sample's ``data_source`` field selects a ``<data_source>Sandbox``
    class whose ``verify_score`` / ``extract_output`` methods perform the
    actual grading.
    """

    def __init__(self,
                 short_penalty: bool = False,
                 format_penalty: bool = False):
        """
        Args:
            short_penalty: Forwarded to ``verify_score``; penalizes overly
                short outputs.
            format_penalty: Forwarded to ``verify_score``; penalizes
                malformed outputs.
        """
        super().__init__()
        self.short_penalty = short_penalty
        self.format_penalty = format_penalty

    def score(self, predictions, references, test_set):
        """Grade ``predictions`` against JSON-encoded ``references``.

        Returns:
            dict with ``accuracy`` (mean per-sample score) and ``details``
            (one record per sample), or a dict with an ``error`` key when
            the input lengths disagree.
        """
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }
        if not predictions:
            # Guard: the averaging below would raise ZeroDivisionError
            # (and test_set[0] would raise IndexError) on an empty set.
            return {'accuracy': 0.0, 'details': []}

        # NOTE(review): assumes every sample shares the same data_source;
        # only the first sample selects the sandbox class -- confirm.
        class_name = f"{test_set[0]['data_source']}Sandbox"
        # Hoist the module import and class lookup out of the loop --
        # both are invariant across samples.
        module = importlib.import_module('intern_sandbox')
        sandbox_cls = getattr(module, class_name)

        details = []
        for pred, ref, ts in zip(predictions, references, test_set):
            ref = json.loads(ref)
            score = sandbox_cls.verify_score(
                pred,
                ref,
                short_penalty=self.short_penalty,
                format_penalty=self.format_penalty)
            try:
                extracted = sandbox_cls.extract_output(pred)
            except Exception:  # extraction is best-effort; fall back to None
                extracted = None
            details.append({
                'prompt': ts['prompt'],
                'score': score,
                'extracted_output': extracted,
                'ground_truth': ref,
                'output': pred,
            })

        avg_score = sum(r['score'] for r in details) / len(details)
        return {'accuracy': avg_score, 'details': details}