OpenCompass/opencompass/datasets/ds1000_interpreter.py

from typing import List, Optional, Union

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET

from .ds1000 import DS1000Dataset


@LOAD_DATASET.register_module()
class DS1000Dataset_Interperter(DS1000Dataset):
    """Code interpreter version of DS1000."""

    def load(
        self,
        path: str,
        libs: Optional[Union[str, list]] = None,
        mode: str = 'Insertion',
    ):
        dataset = super().load(path, libs, mode)

        def preprocess(example):
            """Get rid of unnecessary code block in prompt."""
            prompt = example.pop('prompt')
            example['prompt'] = prompt[:prompt.find('A:\n')].strip()
            return example

        dataset = dataset.map(preprocess)
        return dataset


class DS1000InterpreterEvaluator(BaseEvaluator):
    """DS1000 interpreter evaluator.

    Args:
        action (str): Action for catching internal prediction.
            Defaults to `PythonInterpreter`.
    """

    def __init__(self, action: str = 'PythonInterpreter'):
        self.action = action

    def get_action(self, step):
        for s in step[::-1]:
            if s['type'] == self.action:
                return s

    def score(self, predictions: List, references: List, steps: List):
        """Calculate accuracy."""

        action_scope = 0
        follow_scope = 0
        soft_success = 0
        success = 0
        total = len(references)
        for step in steps:
            s = self.get_action(step)
            if s:
                action_scope += 1
                if not s['errmsg']:
                    soft_success += 1
                # assert must in code for testing
                # otherwise the result will be True
                if s['args'] and 'assert' in s['args']['text']:
                    follow_scope += 1
                    # successful result should count as passed
                    if s['result']:
                        success += s['result']['text'] == 'True'

        result = dict(
            action_pct=100 * action_scope / total,
            soft_code_acc=100 * soft_success / total,
            follow_acc=100 * follow_scope / total,
            code_acc=100 * success / total,
        )
        return result
[Feat] Support cibench (#538) * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * rename cibench * rename cibench * rename cibench * rename cibench * minor fix * minor fix * minor fix 2023-11-07 19:11:44 +08:00			`from typing import List, Optional, Union`

			`from opencompass.openicl.icl_evaluator import BaseEvaluator`
			`from opencompass.registry import LOAD_DATASET`

			`from .ds1000 import DS1000Dataset`


			`@LOAD_DATASET.register_module()`
			`class DS1000Dataset_Interperter(DS1000Dataset):`
			`"""Code interpreter version of DS1000."""`

			`def load(`
			`self,`
			`path: str,`
			`libs: Optional[Union[str, list]] = None,`
			`mode: str = 'Insertion',`
			`):`
			`dataset = super().load(path, libs, mode)`

			`def preprocess(example):`
			`"""Get rid of unnecessary code block in prompt."""`
			`prompt = example.pop('prompt')`
			`example['prompt'] = prompt[:prompt.find('A:\n')].strip()`
			`return example`

			`dataset = dataset.map(preprocess)`
			`return dataset`


			`class DS1000InterpreterEvaluator(BaseEvaluator):`
			`"""DS1000 interpreter evaluator.`

			`Args:`
			`action (str): Action for catching internal prediction.`
			Defaults to `PythonInterpreter`.
			`"""`

			`def __init__(self, action: str = 'PythonInterpreter'):`
			`self.action = action`

			`def get_action(self, step):`
			`for s in step[::-1]:`
			`if s['type'] == self.action:`
			`return s`

			`def score(self, predictions: List, references: List, steps: List):`
			`"""Calculate accuracy."""`

			`action_scope = 0`
			`follow_scope = 0`
			`soft_success = 0`
			`success = 0`
			`total = len(references)`
			`for step in steps:`
			`s = self.get_action(step)`
			`if s:`
			`action_scope += 1`
			`if not s['errmsg']:`
			`soft_success += 1`
			`# assert must in code for testing`
			`# otherwise the result will be True`
			`if s['args'] and 'assert' in s['args']['text']:`
			`follow_scope += 1`
			`# successful result should count as passed`
			`if s['result']:`
			`success += s['result']['text'] == 'True'`

			`result = dict(`
			`action_pct=100 * action_scope / total,`
			`soft_code_acc=100 * soft_success / total,`
			`follow_acc=100 * follow_scope / total,`
			`code_acc=100 * success / total,`
			`)`
			`return result`