OpenCompass/opencompass/datasets/IFEval/evaluation_main.py

# flake8: noqa
# yapf: disable

# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
from typing import Dict, Optional, Union

from absl import flags

import opencompass.datasets.IFEval.instructions_registry as instructions_registry

_INPUT_DATA = flags.DEFINE_string('input_data',
                                  None,
                                  'path to input data',
                                  required=True)

_INPUT_RESPONSE_DATA = flags.DEFINE_string('input_response_data',
                                           None,
                                           'path to input response data',
                                           required=False)

_OUTPUT_DIR = flags.DEFINE_string(
    'output_dir',
    None,
    'Output directory for inference and eval results.',
    required=True,
)


@dataclasses.dataclass
class InputExample:
    key: int
    instruction_id_list: list[str]
    prompt: str
    kwargs: list[Dict[str, Optional[Union[str, int]]]]


@dataclasses.dataclass
class OutputExample:
    instruction_id_list: list[str]
    prompt: str
    response: str
    follow_all_instructions: bool
    follow_instruction_list: list[bool]


def test_instruction_following_strict(
    inp,
    response,
):
    """Tests response to see if instrutions are followed."""
    instruction_list = inp.instruction_id_list
    is_following_list = []

    for index, instruction_id in enumerate(instruction_list):
        instruction_cls = instructions_registry.INSTRUCTION_DICT[
            instruction_id]
        instruction = instruction_cls(instruction_id)
        instruction.build_description(**inp.kwargs[index])
        args = instruction.get_instruction_args()
        if args and 'prompt' in args:
            instruction.build_description(prompt=inp.prompt)

        if response.strip() and instruction.check_following(response):
            is_following_list.append(True)
        else:
            is_following_list.append(False)

    return OutputExample(
        instruction_id_list=inp.instruction_id_list,
        prompt=inp.prompt,
        response=response,
        follow_all_instructions=all(is_following_list),
        follow_instruction_list=is_following_list,
    )


def test_instruction_following_loose(
    inp,
    response,
):
    """Tests response for an upper bound for following instructions."""
    r = response.split('\n')
    response_remove_first = '\n'.join(r[1:]).strip()
    response_remove_last = '\n'.join(r[:-1]).strip()
    response_remove_both = '\n'.join(r[1:-1]).strip()
    revised_response = response.replace('*', '')
    revised_response_remove_first = response_remove_first.replace('*', '')
    revised_response_remove_last = response_remove_last.replace('*', '')
    revised_response_remove_both = response_remove_both.replace('*', '')
    all_responses = [
        response,
        revised_response,
        response_remove_first,
        response_remove_last,
        response_remove_both,
        revised_response_remove_first,
        revised_response_remove_last,
        revised_response_remove_both,
    ]
    instruction_list = inp.instruction_id_list
    is_following_list = []

    for index, instruction_id in enumerate(instruction_list):
        instruction_cls = instructions_registry.INSTRUCTION_DICT[
            instruction_id]
        instruction = instruction_cls(instruction_id)

        instruction.build_description(**inp.kwargs[index])
        args = instruction.get_instruction_args()
        if args and 'prompt' in args:
            instruction.build_description(prompt=inp.prompt)

        is_following = False
        for r in all_responses:
            if r.strip() and instruction.check_following(r):
                is_following = True
                break

        is_following_list.append(is_following)

    return OutputExample(
        instruction_id_list=inp.instruction_id_list,
        prompt=inp.prompt,
        response=response,
        follow_all_instructions=all(is_following_list),
        follow_instruction_list=is_following_list,
    )
[Feature] Add IFEval (#813) * [Feature] Add IFEval * [Doc] add introduction of IFEval 2024-01-23 20:07:49 +08:00			`# flake8: noqa`
			`# yapf: disable`

			`# Copyright 2023 The Google Research Authors.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import dataclasses`
			`from typing import Dict, Optional, Union`

			`from absl import flags`

[Fix] Fix IFEval (#906) * fix ifeval * fix ifeval * fix ifeval * fix ifeval 2024-02-22 16:51:34 +08:00			`import opencompass.datasets.IFEval.instructions_registry as instructions_registry`
[Feature] Add IFEval (#813) * [Feature] Add IFEval * [Doc] add introduction of IFEval 2024-01-23 20:07:49 +08:00
			`_INPUT_DATA = flags.DEFINE_string('input_data',`
			`None,`
			`'path to input data',`
			`required=True)`

			`_INPUT_RESPONSE_DATA = flags.DEFINE_string('input_response_data',`
			`None,`
			`'path to input response data',`
			`required=False)`

			`_OUTPUT_DIR = flags.DEFINE_string(`
			`'output_dir',`
			`None,`
			`'Output directory for inference and eval results.',`
			`required=True,`
			`)`


			`@dataclasses.dataclass`
			`class InputExample:`
			`key: int`
			`instruction_id_list: list[str]`
			`prompt: str`
			`kwargs: list[Dict[str, Optional[Union[str, int]]]]`


			`@dataclasses.dataclass`
			`class OutputExample:`
			`instruction_id_list: list[str]`
			`prompt: str`
			`response: str`
			`follow_all_instructions: bool`
			`follow_instruction_list: list[bool]`


			`def test_instruction_following_strict(`
			`inp,`
			`response,`
			`):`
			`"""Tests response to see if instrutions are followed."""`
			`instruction_list = inp.instruction_id_list`
			`is_following_list = []`

			`for index, instruction_id in enumerate(instruction_list):`
			`instruction_cls = instructions_registry.INSTRUCTION_DICT[`
			`instruction_id]`
			`instruction = instruction_cls(instruction_id)`
			`instruction.build_description(**inp.kwargs[index])`
			`args = instruction.get_instruction_args()`
			`if args and 'prompt' in args:`
			`instruction.build_description(prompt=inp.prompt)`

			`if response.strip() and instruction.check_following(response):`
			`is_following_list.append(True)`
			`else:`
			`is_following_list.append(False)`

			`return OutputExample(`
			`instruction_id_list=inp.instruction_id_list,`
			`prompt=inp.prompt,`
			`response=response,`
			`follow_all_instructions=all(is_following_list),`
			`follow_instruction_list=is_following_list,`
			`)`


			`def test_instruction_following_loose(`
			`inp,`
			`response,`
			`):`
			`"""Tests response for an upper bound for following instructions."""`
			`r = response.split('\n')`
			`response_remove_first = '\n'.join(r[1:]).strip()`
			`response_remove_last = '\n'.join(r[:-1]).strip()`
			`response_remove_both = '\n'.join(r[1:-1]).strip()`
			`revised_response = response.replace('*', '')`
			`revised_response_remove_first = response_remove_first.replace('*', '')`
			`revised_response_remove_last = response_remove_last.replace('*', '')`
			`revised_response_remove_both = response_remove_both.replace('*', '')`
			`all_responses = [`
			`response,`
			`revised_response,`
			`response_remove_first,`
			`response_remove_last,`
			`response_remove_both,`
			`revised_response_remove_first,`
			`revised_response_remove_last,`
			`revised_response_remove_both,`
			`]`
			`instruction_list = inp.instruction_id_list`
			`is_following_list = []`

			`for index, instruction_id in enumerate(instruction_list):`
			`instruction_cls = instructions_registry.INSTRUCTION_DICT[`
			`instruction_id]`
			`instruction = instruction_cls(instruction_id)`

			`instruction.build_description(**inp.kwargs[index])`
			`args = instruction.get_instruction_args()`
			`if args and 'prompt' in args:`
			`instruction.build_description(prompt=inp.prompt)`

			`is_following = False`
			`for r in all_responses:`
			`if r.strip() and instruction.check_following(r):`
			`is_following = True`
			`break`

			`is_following_list.append(is_following)`

			`return OutputExample(`
			`instruction_id_list=inp.instruction_id_list,`
			`prompt=inp.prompt,`
			`response=response,`
			`follow_all_instructions=all(is_following_list),`
			`follow_instruction_list=is_following_list,`
			`)`