import importlib
import json
import os.path as osp

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset


@LOAD_DATASET.register_module()
class InternSandboxDataset(BaseDataset):
    """Load `{sandbox}.jsonl` records from `path` into a `datasets.Dataset`."""

    @staticmethod
    def load(path: str, sandbox: str, local_mode: bool = False):
        path = get_data_path(path, local_mode=local_mode)
        file_path = osp.join(path, f'{sandbox}.jsonl')
        data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                origin_data = json.loads(line)
                # Re-serialize `ground_truth` to a JSON string so every row
                # has a uniform column type; it is decoded again in
                # `InternSandboxEvaluator.score`.
                origin_data['ground_truth'] = json.dumps(
                    origin_data['ground_truth'])
                data.append(origin_data)
        return Dataset.from_list(data)

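# A minimal sketch of one line in `{sandbox}.jsonl`, assuming only the fields
# this module actually reads ('prompt', 'data_source', 'ground_truth'); real
# records may carry additional keys:
#
#   {"prompt": "...", "data_source": "Foo", "ground_truth": {"answer": "42"}}

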
@ICL_EVALUATORS.register_module()
class InternSandboxEvaluator(BaseEvaluator):
    """Score predictions with the sandbox class matching each sample's
    `data_source`, optionally applying short/format penalties."""

    def __init__(self,
                 short_penalty: bool = False,
                 format_penalty: bool = False):
        super().__init__()
        self.short_penalty = short_penalty
        self.format_penalty = format_penalty

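    # Sketch of the interface each `<data_source>Sandbox` class in the
    # external `intern_sandbox` package is assumed to expose, inferred from
    # the calls in `score` below (not a documented contract):
    #
    #   class FooSandbox:
    #       @staticmethod
    #       def verify_score(pred, ref, short_penalty=False,
    #                        format_penalty=False): ...
    #
    #       @staticmethod
    #       def extract_output(pred): ...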
    def score(self, predictions, references, test_set):
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        # Every sample in a run shares one data source, so the sandbox
        # class is resolved once; `importlib.import_module` caches the
        # module, so hoisting it out of the loop is behavior-preserving.
        class_name = f"{test_set[0]['data_source']}Sandbox"
        module = importlib.import_module('intern_sandbox')
        sandbox_cls = getattr(module, class_name)

        details = []
        for pred, ref, ts in zip(predictions, references, test_set):
            ref = json.loads(ref)
            score = sandbox_cls.verify_score(
                pred,
                ref,
                short_penalty=self.short_penalty,
                format_penalty=self.format_penalty)
            try:
                extracted = sandbox_cls.extract_output(pred)
            except Exception:
                extracted = None

            res = {
                'prompt': ts['prompt'],
                'score': score,
                'extracted_output': extracted,
                'ground_truth': ref,
                'output': pred,
            }
            details.append(res)

        avg_score = sum(r['score'] for r in details) / len(details)
        results = {'accuracy': avg_score, 'details': details}
        return results
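

# A minimal usage sketch; paths, sandbox name, and `model_outputs` are
# hypothetical, not part of this module:
#
#   dataset = InternSandboxDataset.load(path='data/intern_sandbox',
#                                       sandbox='Foo', local_mode=True)
#   evaluator = InternSandboxEvaluator(short_penalty=True)
#   results = evaluator.score(predictions=model_outputs,
#                             references=dataset['ground_truth'],
#                             test_set=dataset)
#   print(results['accuracy'])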