OpenCompass/opencompass/datasets/IFEval/ifeval.py

import json

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .evaluation_main import (InputExample, test_instruction_following_loose,
                              test_instruction_following_strict)


@LOAD_DATASET.register_module()
class IFEvalDataset(BaseDataset):
    """Loader that turns an IFEval JSON Lines file into a ``Dataset``."""

    @staticmethod
    def load(path):
        """Read the JSONL file at ``path`` and build a ``Dataset`` from it.

        Args:
            path: Dataset location. It is resolved through
                ``get_data_path(path, local_mode=True)`` before being opened.

        Returns:
            The result of ``Dataset.from_list`` over one row per record.
            Each row has a ``prompt`` column (the record's ``'prompt'``
            field) and a ``reference`` column holding the whole decoded
            record.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record has no ``'prompt'`` field.
        """
        path = get_data_path(path, local_mode=True)
        records = []
        with open(path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                # Blank lines (e.g. an extra newline at the end of the file)
                # carry no record, and json.loads('') would raise.
                if not line:
                    continue
                record = json.loads(line)
                records.append(dict(prompt=record['prompt'],
                                    reference=record))
        return Dataset.from_list(records)


class IFEvaluator(BaseEvaluator):
    """Evaluator reporting strict and loose instruction-following accuracy."""

    @staticmethod
    def _percentage(correct, total):
        """Return ``correct / total * 100``, or ``0.0`` if ``total`` is 0."""
        return correct / total * 100 if total else 0.0

    def score(self, predictions, references, origin_prompt):
        """Score each prediction with the strict and the loose checker.

        Args:
            predictions: Model outputs, one per example.
            references: Per-example records; each must provide the keys
                ``'key'``, ``'instruction_id_list'``, ``'prompt'`` and
                ``'kwargs'``.
            origin_prompt: Sequence indexed by example position; the entry
                is copied into that example's details.

        Returns:
            A dict with four accuracies expressed as percentages
            (``'Prompt-level-strict-accuracy'``,
            ``'Inst-level-strict-accuracy'``,
            ``'Prompt-level-loose-accuracy'``,
            ``'Inst-level-loose-accuracy'``) and a ``'details'`` dict keyed
            by the stringified example index. An accuracy whose denominator
            is zero (no examples, or no instructions at all) is reported as
            ``0.0`` instead of raising ``ZeroDivisionError``.

        Note:
            ``predictions`` and ``references`` are paired with ``zip``, so
            surplus items in the longer one are ignored.
        """
        prompt_strict_correct, prompt_strict_total = 0, 0
        inst_strict_correct, inst_strict_total = 0, 0
        prompt_loose_correct, prompt_loose_total = 0, 0
        inst_loose_correct, inst_loose_total = 0, 0
        details = {}
        for index, (pred, refer) in enumerate(zip(predictions, references)):
            # Named ``example_input`` so the builtin ``input`` is not shadowed.
            example_input = InputExample(
                key=refer['key'],
                instruction_id_list=refer['instruction_id_list'],
                prompt=refer['prompt'],
                kwargs=refer['kwargs'])
            # Remove None-valued arguments from every kwargs dict before the
            # checkers see them. The dicts are edited in place, so if
            # InputExample keeps the list it was given, ``refer['kwargs']``
            # (and therefore the 'refer' entry in details) is stripped too.
            # NOTE(review): presumably the None values are padding added when
            # the records were stored in a Dataset -- confirm.
            for kwarg in example_input.kwargs:
                for k in list(kwarg.keys()):
                    if kwarg[k] is None:
                        kwarg.pop(k, None)

            # strict
            example = test_instruction_following_strict(example_input, pred)
            follow_instruction_list = example.follow_instruction_list
            instruction_id_list = example.instruction_id_list
            prompt_strict_total += 1
            # A prompt counts as correct only if every instruction is followed.
            is_strict_correct = all(follow_instruction_list)
            prompt_strict_correct += is_strict_correct
            inst_strict_total += len(instruction_id_list)
            inst_strict_correct += sum(follow_instruction_list)

            # loose
            example = test_instruction_following_loose(example_input, pred)
            follow_instruction_list = example.follow_instruction_list
            instruction_id_list = example.instruction_id_list
            prompt_loose_total += 1
            is_loose_correct = all(follow_instruction_list)
            prompt_loose_correct += is_loose_correct
            inst_loose_total += len(instruction_id_list)
            inst_loose_correct += sum(follow_instruction_list)

            # The grade names the first check passed, trying strict first.
            if is_strict_correct:
                grade = 'strict'
            elif is_loose_correct:
                grade = 'loose'
            else:
                grade = 'none'

            details[str(index)] = {
                'prompt': origin_prompt[index],
                'pred': pred,
                'refer': refer,
                'is_strict_correct': is_strict_correct,
                'is_loose_correct': is_loose_correct,
                # 'is_correct' mirrors the strict verdict.
                'is_correct': is_strict_correct,
                'grade': grade
            }

        results = {
            'Prompt-level-strict-accuracy':
            self._percentage(prompt_strict_correct, prompt_strict_total),
            'Inst-level-strict-accuracy':
            self._percentage(inst_strict_correct, inst_strict_total),
            'Prompt-level-loose-accuracy':
            self._percentage(prompt_loose_correct, prompt_loose_total),
            'Inst-level-loose-accuracy':
            self._percentage(inst_loose_correct, inst_loose_total),
            'details':
            details
        }
        return results
[Feature] Add IFEval (#813) * [Feature] Add IFEval * [Doc] add introduction of IFEval 2024-01-23 20:07:49 +08:00			`import json`

			`from datasets import Dataset`

			`from opencompass.openicl.icl_evaluator import BaseEvaluator`
			`from opencompass.registry import LOAD_DATASET`
[Feature] Support ModelScope datasets (#1289) * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * udpate dataset for modelscope support * update readme * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * update readme * remove tydiqa japanese subset * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * update readme * udpate dataset for modelscope support * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * remove tydiqa japanese subset * update util * remove .DS_Store * fix md format * move util into package * update docs/get_started.md * restore eval_api_zhipu_v2.py, add environment setting * Update dataset * Update * Update * Update * Update --------- Co-authored-by: Yun lin <yunlin@U-Q9X2K4QV-1904.local> Co-authored-by: Yunnglin <mao.looper@qq.com> Co-authored-by: Yun lin <yunlin@laptop.local> Co-authored-by: Yunnglin <maoyl@smail.nju.edu.cn> Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn> 2024-07-29 13:48:32 +08:00			`from opencompass.utils import get_data_path`
[Feature] Add IFEval (#813) * [Feature] Add IFEval * [Doc] add introduction of IFEval 2024-01-23 20:07:49 +08:00
			`from ..base import BaseDataset`
			`from .evaluation_main import (InputExample, test_instruction_following_loose,`
			`test_instruction_following_strict)`


			`@LOAD_DATASET.register_module()`
			`class IFEvalDataset(BaseDataset):`

			`@staticmethod`
			`def load(path):`
[Feature] Support ModelScope datasets (#1289) * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * udpate dataset for modelscope support * update readme * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * update readme * remove tydiqa japanese subset * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * update readme * udpate dataset for modelscope support * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * remove tydiqa japanese subset * update util * remove .DS_Store * fix md format * move util into package * update docs/get_started.md * restore eval_api_zhipu_v2.py, add environment setting * Update dataset * Update * Update * Update * Update --------- Co-authored-by: Yun lin <yunlin@U-Q9X2K4QV-1904.local> Co-authored-by: Yunnglin <mao.looper@qq.com> Co-authored-by: Yun lin <yunlin@laptop.local> Co-authored-by: Yunnglin <maoyl@smail.nju.edu.cn> Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn> 2024-07-29 13:48:32 +08:00			`path = get_data_path(path, local_mode=True)`
[Feature] Add IFEval (#813) * [Feature] Add IFEval * [Doc] add introduction of IFEval 2024-01-23 20:07:49 +08:00			`datasets = []`
			`with open(path, 'r', encoding='utf-8') as file:`
			`for line in file:`
			`tmp = json.loads(line.strip())`
			`dataset = dict(prompt=tmp['prompt'], reference=tmp)`
			`datasets.append(dataset)`
			`return Dataset.from_list(datasets)`


			`class IFEvaluator(BaseEvaluator):`

[Sync] update evaluator (#1175) 2024-05-21 14:22:46 +08:00			`def score(self, predictions, references, origin_prompt):`
			`prompt_strict_correct, prompt_strict_total = 0, 0`
			`inst_strict_correct, inst_strict_total = 0, 0`
			`prompt_loose_correct, prompt_loose_total = 0, 0`
			`inst_loose_correct, inst_loose_total = 0, 0`
			`details = {}`
			`for index, (pred, refer) in enumerate(zip(predictions, references)):`
[Feature] Add IFEval (#813) * [Feature] Add IFEval * [Doc] add introduction of IFEval 2024-01-23 20:07:49 +08:00			`input = InputExample(`
			`key=refer['key'],`
			`instruction_id_list=refer['instruction_id_list'],`
			`prompt=refer['prompt'],`
			`kwargs=refer['kwargs'])`
			`for kwarg in input.kwargs:`
			`for k in list(kwarg.keys()):`
			`if kwarg[k] is None:`
			`kwarg.pop(k, None)`
[Fix] fix ifeval (#909) 2024-02-23 16:52:03 +08:00
[Sync] update evaluator (#1175) 2024-05-21 14:22:46 +08:00			`# strict`
			`example = test_instruction_following_strict(input, pred)`
			`follow_instruction_list = example.follow_instruction_list`
			`instruction_id_list = example.instruction_id_list`
			`prompt_strict_total += 1`
			`is_strict_correct = all(follow_instruction_list)`
			`prompt_strict_correct += is_strict_correct`
			`inst_strict_total += len(instruction_id_list)`
			`inst_strict_correct += sum(follow_instruction_list)`
[Fix] fix ifeval (#909) 2024-02-23 16:52:03 +08:00
[Sync] update evaluator (#1175) 2024-05-21 14:22:46 +08:00			`# loose`
			`example = test_instruction_following_loose(input, pred)`
			`follow_instruction_list = example.follow_instruction_list`
			`instruction_id_list = example.instruction_id_list`
			`prompt_loose_total += 1`
			`is_loose_correct = all(follow_instruction_list)`
			`prompt_loose_correct += is_loose_correct`
			`inst_loose_total += len(instruction_id_list)`
			`inst_loose_correct += sum(follow_instruction_list)`
[Fix] fix ifeval (#909) 2024-02-23 16:52:03 +08:00
[Sync] update evaluator (#1175) 2024-05-21 14:22:46 +08:00			`if is_strict_correct:`
			`grade = 'strict'`
			`elif is_loose_correct:`
			`grade = 'loose'`
			`else:`
			`grade = 'none'`

			`details[str(index)] = {`
			`'prompt': origin_prompt[index],`
			`'pred': pred,`
			`'refer': refer,`
			`'is_strict_correct': is_strict_correct,`
			`'is_loose_correct': is_loose_correct,`
			`'is_correct': is_strict_correct,`
			`'grade': grade`
			`}`

			`results = {`
			`'Prompt-level-strict-accuracy':`
			`prompt_strict_correct / prompt_strict_total * 100,`
			`'Inst-level-strict-accuracy':`
			`inst_strict_correct / inst_strict_total * 100,`
			`'Prompt-level-loose-accuracy':`
			`prompt_loose_correct / prompt_loose_total * 100,`
			`'Inst-level-loose-accuracy':`
			`inst_loose_correct / inst_loose_total * 100,`
			`'details':`
			`details`
			`}`
			`return results`