OpenCompass/opencompass/datasets/gsm8k.py

import json
import os
import re
from os import environ

from datasets import Dataset, DatasetDict

from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_data_path

from .base import BaseDataset


@LOAD_DATASET.register_module()
class GSM8KDataset(BaseDataset):
    """Loader for the GSM8K dataset.

    The data is read either through ModelScope or from local
    ``train.jsonl`` / ``test.jsonl`` files, depending on the
    ``DATASET_SOURCE`` environment variable.
    """

    @staticmethod
    def load(path):
        """Load the GSM8K ``train`` and ``test`` splits.

        Args:
            path (str): Dataset location. It is first resolved with
                ``get_data_path``; the result is used as the ModelScope
                dataset name, or as the local directory that contains
                ``train.jsonl`` and ``test.jsonl``.

        Returns:
            The object returned by ``MsDataset.load`` when
            ``DATASET_SOURCE`` equals ``'ModelScope'``; otherwise a
            ``DatasetDict`` holding the ``train`` and ``test`` splits.
        """
        path = get_data_path(path)
        if environ.get('DATASET_SOURCE') == 'ModelScope':
            # Imported lazily so modelscope is only required for this source.
            from modelscope import MsDataset
            return MsDataset.load(dataset_name=path, trust_remote_code=True)

        splits = {}
        for split in ['train', 'test']:
            split_path = os.path.join(path, split + '.jsonl')
            with open(split_path, 'r', encoding='utf-8') as f:
                # Blank lines (e.g. a trailing empty line) are skipped:
                # json.loads('') would raise and abort the whole load.
                rows = [json.loads(row) for row in map(str.strip, f) if row]
            splits[split] = Dataset.from_list(rows)
        return DatasetDict(splits)


@TEXT_POSTPROCESSORS.register_module('gsm8k_dataset')
def gsm8k_dataset_postprocess(text: str) -> str:
    """Extract the final answer from a GSM8K reference solution.

    The text is split on the ``'#### '`` marker and the piece right after
    the first marker is returned with every comma removed.

    Args:
        text (str): Reference solution containing the ``'#### '`` marker.

    Returns:
        str: The text following the marker, without commas.

    Raises:
        IndexError: If ``text`` does not contain the marker.
    """
    pieces = text.split('#### ')
    answer = pieces[1]
    return answer.replace(',', '')


@TEXT_POSTPROCESSORS.register_module('gsm8k')
def gsm8k_postprocess(text: str) -> str:
    """Extract the last number from a model prediction.

    Everything from the first ``'Question:'`` onwards is discarded, then the
    last integer or decimal number found in the remaining text is returned.

    Numbers written with thousands separators (``1,000`` or ``12,345.67``)
    are treated as a single number. Without this step ``1,000`` would be
    split at the comma and extracted as ``000``.

    Args:
        text (str): Raw model output.

    Returns:
        str: The last number in the text (optionally signed, optionally with
        a decimal part), or ``'NULL'`` when the text contains no number.
    """
    text = text.split('Question:')[0]
    # Only strip commas from well-formed groupings: 1-3 leading digits
    # followed by groups of exactly three digits. Sequences such as "1,2"
    # or "1234,567" are left untouched.
    text = re.sub(r'(?<![\d,])\d{1,3}(?:,\d{3})+(?!\d)',
                  lambda m: m.group().replace(',', ''), text)
    numbers = re.findall(r'\-?\d+\.\d+|\-?\d+', text)
    if not numbers:
        return 'NULL'
    return numbers[-1]


class Gsm8kEvaluator(BaseEvaluator):
    """Accuracy evaluator for GSM8K predictions."""

    def is_equal(self, pred, refer):
        """Return whether a prediction matches a reference answer.

        The two values match when they compare equal, or when both can be
        converted to numbers that differ by less than ``1e-6``.

        Args:
            pred: Predicted answer, usually a string.
            refer: Reference answer, usually a string.

        Returns:
            bool: ``True`` on a match, ``False`` otherwise (including when
            either value cannot be converted to a number).
        """
        try:
            if pred == refer:
                return True
            # float() rather than int() on the reference: int() would
            # truncate float references and reject strings such as "18.0".
            return abs(float(pred) - float(refer)) < 1e-6
        except (TypeError, ValueError, OverflowError):
            # Non-numeric values simply do not match.
            return False

    def score(self, predictions, references):
        """Compute accuracy over paired predictions and references.

        Args:
            predictions (list): Predicted answers.
            references (list): Reference answers, aligned with
                ``predictions``.

        Returns:
            dict: ``{'error': ...}`` when the two lists differ in length;
            otherwise ``{'accuracy': <percentage>, 'details': [...]}``
            where each detail records ``pred``, ``answer`` and ``correct``.
            Empty input yields an accuracy of ``0.0`` instead of dividing
            by zero.
        """
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different '
                'length'
            }
        correct = 0
        details = []
        for pred, refer in zip(predictions, references):
            matched = self.is_equal(pred, refer)
            correct += matched
            details.append({'pred': pred, 'answer': refer, 'correct': matched})
        count = len(details)
        accuracy = 100 * correct / count if count else 0.0
        return {'accuracy': accuracy, 'details': details}


class Gsm8kAgentEvaluator(BaseEvaluator):
    """Gsm8k agent evaluator for soft condition.

    Args:
        action (str): Action for catching internal prediction.
            Defaults to `PythonInterpreter`.
    """

    def __init__(self, action: str = 'PythonInterpreter'):
        # NOTE(review): BaseEvaluator.__init__ is not invoked here; confirm
        # that the base class needs no initialisation.
        self.action = action

    def is_equal(self, pred, refer):
        """Return whether a prediction matches a reference answer.

        The two values match when they compare equal, or when both can be
        converted to numbers that differ by less than ``1e-6``.

        Args:
            pred: Predicted answer, usually a string.
            refer: Reference answer, usually a string.

        Returns:
            bool: ``True`` on a match, ``False`` otherwise (including when
            either value cannot be converted to a number).
        """
        try:
            if pred == refer:
                return True
            # float() rather than int() on the reference: int() would
            # truncate float references and reject strings such as "18.0".
            return abs(float(pred) - float(refer)) < 1e-6
        except (TypeError, ValueError, OverflowError):
            # Non-numeric values simply do not match.
            return False

    def soft_equal(self, pred, refer, step):
        """Return whether an action step's result matches the reference.

        Args:
            pred: Final prediction. Unused; kept so the signature stays
                unchanged for existing callers.
            refer: Reference answer.
            step: Action step; its ``step['result']['text']`` is compared
                numerically with ``refer``.

        Returns:
            bool: ``True`` when the step result is within ``1e-6`` of the
            reference, ``False`` otherwise.
        """
        try:
            soft_pred = step['result']['text']
            return abs(float(soft_pred) - float(refer)) < 1e-6
        except (KeyError, IndexError, TypeError, ValueError, OverflowError):
            # The result might not exist, or its text might not be a number.
            return False

    def get_action(self, step):
        """Return the last entry of ``step`` whose type is ``self.action``.

        Args:
            step: Sequence of step entries; each entry is indexed with
                ``'type'``.

        Returns:
            The matching entry closest to the end of ``step``, or ``None``
            when no entry has the configured action type.
        """
        for s in reversed(step):
            if s['type'] == self.action:
                return s
        return None

    def score(self, predictions, references, steps):
        """Calculate accuracy.

        Args:
            predictions (list): Final predicted answers.
            references (list): Reference answers.
            steps (list): Per-sample step sequences, aligned with
                ``predictions``.

        Returns:
            dict: ``{'error': ...}`` when the input lengths disagree;
            otherwise percentages over all samples:

            - ``follow_acc``: final prediction matches the reference.
            - ``reasoning_acc``: final prediction matches, or the last
              action step ran without ``errmsg`` and its result matches.
            - ``code_acc``: final prediction matches with an action step
              present, or it does not match but the last action step has
              no ``errmsg``.
            - ``action_pct``: an action step of the configured type is
              present, excluding samples answered correctly without one.

            Empty input yields ``0.0`` for every metric instead of dividing
            by zero.
        """
        if len(predictions) != len(references):
            return {'error': 'preds and refrs have different length'}
        if len(steps) != len(references):
            # zip() below would otherwise silently drop samples while the
            # denominator still counts every reference.
            return {'error': 'steps and refrs have different length'}

        row_reasoning_scope = 0
        action_scope = 0
        code_scope = 0
        reasoning_scope = 0
        final_scope = 0
        total = len(references)
        for pred, refer, step in zip(predictions, references, steps):
            action_step = self.get_action(step)
            if self.is_equal(pred, refer):
                # Final answer is right, with or without an action step.
                if action_step:
                    final_scope += 1
                else:
                    row_reasoning_scope += 1
            elif action_step:
                action_scope += 1
                if not action_step['errmsg']:
                    code_scope += 1
                    # whether action result is correct
                    reasoning_scope += self.soft_equal(
                        pred, refer, action_step)

        def percent(hits):
            return 100 * hits / total if total else 0.0

        result = dict(
            follow_acc=percent(row_reasoning_scope + final_scope),
            reasoning_acc=percent(reasoning_scope + final_scope +
                                  row_reasoning_scope),
            code_acc=percent(code_scope + final_scope),
            action_pct=percent(action_scope + final_scope),
        )
        return result
[Feature] Use dataset in local path (#570) * update commonsenseqa * update drop * update flores_first100 * update gsm8k * update humaneval * update lambda * update obqa * update piqa * update race * update siqa * update story_cloze * update strategyqa * update tydiqa * update winogrande * update doc * update hellaswag * fix obqa * update collections * update .zip name 2023-11-13 13:00:37 +08:00			`import json`
			`import os`
fix bug of gsm8k_postprocess (#863) * fix bug of gsm8k_postprocess * update postprocess --------- Co-authored-by: Lei Fei <SENSETIME\leifei1@cn3114002087l.domain.sensetime.com> Co-authored-by: Leymore <zfz-960727@163.com> 2024-02-06 23:52:47 +08:00			`import re`
[Feature] Support ModelScope datasets (#1289) * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * udpate dataset for modelscope support * update readme * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * update readme * remove tydiqa japanese subset * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * update readme * udpate dataset for modelscope support * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * remove tydiqa japanese subset * update util * remove .DS_Store * fix md format * move util into package * update docs/get_started.md * restore eval_api_zhipu_v2.py, add environment setting * Update dataset * Update * Update * Update * Update --------- Co-authored-by: Yun lin <yunlin@U-Q9X2K4QV-1904.local> Co-authored-by: Yunnglin <mao.looper@qq.com> Co-authored-by: Yun lin <yunlin@laptop.local> Co-authored-by: Yunnglin <maoyl@smail.nju.edu.cn> Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn> 2024-07-29 13:48:32 +08:00			`from os import environ`
[Feature] Use dataset in local path (#570) * update commonsenseqa * update drop * update flores_first100 * update gsm8k * update humaneval * update lambda * update obqa * update piqa * update race * update siqa * update story_cloze * update strategyqa * update tydiqa * update winogrande * update doc * update hellaswag * fix obqa * update collections * update .zip name 2023-11-13 13:00:37 +08:00
			`from datasets import Dataset, DatasetDict`

[Sync] update (#517) 2023-10-27 20:31:22 +08:00			`from opencompass.openicl import BaseEvaluator`
[Feature] Use dataset in local path (#570) * update commonsenseqa * update drop * update flores_first100 * update gsm8k * update humaneval * update lambda * update obqa * update piqa * update race * update siqa * update story_cloze * update strategyqa * update tydiqa * update winogrande * update doc * update hellaswag * fix obqa * update collections * update .zip name 2023-11-13 13:00:37 +08:00			`from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS`
[Feature] Support ModelScope datasets (#1289) * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * udpate dataset for modelscope support * update readme * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * update readme * remove tydiqa japanese subset * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * update readme * udpate dataset for modelscope support * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * remove tydiqa japanese subset * update util * remove .DS_Store * fix md format * move util into package * update docs/get_started.md * restore eval_api_zhipu_v2.py, add environment setting * Update dataset * Update * Update * Update * Update --------- Co-authored-by: Yun lin <yunlin@U-Q9X2K4QV-1904.local> Co-authored-by: Yunnglin <mao.looper@qq.com> Co-authored-by: Yun lin <yunlin@laptop.local> Co-authored-by: Yunnglin <maoyl@smail.nju.edu.cn> Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn> 2024-07-29 13:48:32 +08:00			`from opencompass.utils import get_data_path`
[Feature] Use dataset in local path (#570) * update commonsenseqa * update drop * update flores_first100 * update gsm8k * update humaneval * update lambda * update obqa * update piqa * update race * update siqa * update story_cloze * update strategyqa * update tydiqa * update winogrande * update doc * update hellaswag * fix obqa * update collections * update .zip name 2023-11-13 13:00:37 +08:00
			`from .base import BaseDataset`


			`@LOAD_DATASET.register_module()`
			`class GSM8KDataset(BaseDataset):`

			`@staticmethod`
			`def load(path):`
[Feature] Support ModelScope datasets (#1289) * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * udpate dataset for modelscope support * update readme * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * update readme * remove tydiqa japanese subset * add ceval, gsm8k modelscope surpport * update race, mmlu, arc, cmmlu, commonsenseqa, humaneval and unittest * update bbh, flores, obqa, siqa, storycloze, summedits, winogrande, xsum datasets * format file * format file * update dataset format * support ms_dataset * udpate dataset for modelscope support * merge myl_dev and update test_ms_dataset * update readme * udpate dataset for modelscope support * update eval_api_zhipu_v2 * remove unused code * add get_data_path function * remove tydiqa japanese subset * update util * remove .DS_Store * fix md format * move util into package * update docs/get_started.md * restore eval_api_zhipu_v2.py, add environment setting * Update dataset * Update * Update * Update * Update --------- Co-authored-by: Yun lin <yunlin@U-Q9X2K4QV-1904.local> Co-authored-by: Yunnglin <mao.looper@qq.com> Co-authored-by: Yun lin <yunlin@laptop.local> Co-authored-by: Yunnglin <maoyl@smail.nju.edu.cn> Co-authored-by: zhangsongyang <zhangsongyang@pjlab.org.cn> 2024-07-29 13:48:32 +08:00			`path = get_data_path(path)`
			`if environ.get('DATASET_SOURCE') == 'ModelScope':`
			`from modelscope import MsDataset`
			`dataset = MsDataset.load(dataset_name=path, trust_remote_code=True)`
			`else:`
			`datasets = {}`
			`for split in ['train', 'test']:`
			`split_path = os.path.join(path, split + '.jsonl')`
			`dataset = []`
			`with open(split_path, 'r', encoding='utf-8') as f:`
			`for line in f:`
			`line = json.loads(line.strip())`
			`dataset.append(line)`
			`datasets[split] = Dataset.from_list(dataset)`
			`dataset = DatasetDict(datasets)`
			`return dataset`
Support a batch of datasets. 2023-07-05 09:01:25 +08:00

			`@TEXT_POSTPROCESSORS.register_module('gsm8k_dataset')`
			`def gsm8k_dataset_postprocess(text: str) -> str:`
			`return text.split('#### ')[1].replace(',', '')`


			`@TEXT_POSTPROCESSORS.register_module('gsm8k')`
			`def gsm8k_postprocess(text: str) -> str:`
[Sync] Merge branch 'dev' into zfz/update-keyset-demo (#876) 2024-02-05 23:29:10 +08:00			`text = text.split('Question:')[0]`
fix bug of gsm8k_postprocess (#863) * fix bug of gsm8k_postprocess * update postprocess --------- Co-authored-by: Lei Fei <SENSETIME\leifei1@cn3114002087l.domain.sensetime.com> Co-authored-by: Leymore <zfz-960727@163.com> 2024-02-06 23:52:47 +08:00			`numbers = re.findall(r'\-?\d+\.\d+\|\-?\d+', text)`
			`if not numbers:`
			`return 'NULL'`
			`return numbers[-1]`
[Sync] update (#517) 2023-10-27 20:31:22 +08:00

			`class Gsm8kEvaluator(BaseEvaluator):`

[Fix] Fix error in gsm8k evaluator (#782) Co-authored-by: jiangjin1999 <1261842974@qq.com> 2024-02-04 22:55:11 +08:00			`def is_equal(self, pred, refer):`
			`try:`
			`if pred == refer or abs(float(pred) - int(refer)) < 1e-6:`
			`return True`
			`except Exception:`
			`pass`
			`return False`

[Sync] update (#517) 2023-10-27 20:31:22 +08:00			`def score(self, predictions, references):`
			`if len(predictions) != len(references):`
			`return {`
			`'error': 'predictions and references have different '`
			`'length'`
			`}`
			`correct = 0`
			`count = 0`
			`details = []`
			`for i, j in zip(predictions, references):`
[Sync] update model configs (#574) 2023-11-13 15:15:34 +08:00			`detail = {'pred': i, 'answer': j, 'correct': False}`
[Sync] update (#517) 2023-10-27 20:31:22 +08:00			`count += 1`
[Fix] Fix error in gsm8k evaluator (#782) Co-authored-by: jiangjin1999 <1261842974@qq.com> 2024-02-04 22:55:11 +08:00			`if self.is_equal(i, j):`
[Sync] update (#517) 2023-10-27 20:31:22 +08:00			`correct += 1`
			`detail['correct'] = True`
			`details.append(detail)`
			`result = {'accuracy': 100 * correct / count, 'details': details}`
			`return result`
[Feat] Support cibench (#538) * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * rename cibench * rename cibench * rename cibench * rename cibench * minor fix * minor fix * minor fix 2023-11-07 19:11:44 +08:00

			`class Gsm8kAgentEvaluator(BaseEvaluator):`
			`"""Gsm8k agent evaluator for soft condition.`

			`Args:`
			`action (str): Action for catching internal prediction.`
			Defaults to `PythonInterpreter`.
			`"""`

			`def __init__(self, action: str = 'PythonInterpreter'):`
			`self.action = action`

[Feat] update gsm8k and math agent config (#652) * [Feat] update gsm8k and math agent config * minor fix 2023-12-01 15:08:38 +08:00			`def is_equal(self, pred, refer):`
			`try:`
			`if pred == refer or abs(float(pred) - int(refer)) < 1e-6:`
			`return True`
			`except Exception:`
			`pass`
			`return False`

[Feat] Support cibench (#538) * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * rename cibench * rename cibench * rename cibench * rename cibench * minor fix * minor fix * minor fix 2023-11-07 19:11:44 +08:00			`def soft_equal(self, pred, refer, step):`
			`try:`
			`soft_pred = step['result']['text']`
[Feat] update gsm8k and math agent config (#652) * [Feat] update gsm8k and math agent config * minor fix 2023-12-01 15:08:38 +08:00			`if abs(float(soft_pred) - int(refer)) < 1e-6:`
[Feat] Support cibench (#538) * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * rename cibench * rename cibench * rename cibench * rename cibench * minor fix * minor fix * minor fix 2023-11-07 19:11:44 +08:00			`return True`
			`except Exception:`
			`# result might not exists`
			`# text cannot convert to float`
[Feat] update gsm8k and math agent config (#652) * [Feat] update gsm8k and math agent config * minor fix 2023-12-01 15:08:38 +08:00			`pass`
[Feat] Support cibench (#538) * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * rename cibench * rename cibench * rename cibench * rename cibench * minor fix * minor fix * minor fix 2023-11-07 19:11:44 +08:00			`return False`

			`def get_action(self, step):`
			`for s in step[::-1]:`
			`if s['type'] == self.action:`
			`return s`

			`def score(self, predictions, references, steps):`
			`"""Calculate accuracy."""`
[Sync] update configs (#734) 2023-12-25 21:59:16 +08:00			`if len(predictions) != len(references):`
			`return {'error': 'preds and refrs have different length'}`
[Feat] Support cibench (#538) * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * rename cibench * rename cibench * rename cibench * rename cibench * minor fix * minor fix * minor fix 2023-11-07 19:11:44 +08:00
			`row_reasoning_scope = 0`
			`action_scope = 0`
			`code_scope = 0`
			`reasoning_scope = 0`
			`final_scope = 0`
			`total = len(references)`
			`for pred, refer, step in zip(predictions, references, steps):`
			`# if final answer right`
[Feat] update gsm8k and math agent config (#652) * [Feat] update gsm8k and math agent config * minor fix 2023-12-01 15:08:38 +08:00			`if self.is_equal(pred, refer):`
[Feat] Support cibench (#538) * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * rename cibench * rename cibench * rename cibench * rename cibench * minor fix * minor fix * minor fix 2023-11-07 19:11:44 +08:00			`if self.get_action(step):`
			`final_scope += 1`
			`else:`
			`row_reasoning_scope += 1`
			`else:`
			`s = self.get_action(step)`
			`if s:`
			`action_scope += 1`
			`if not s['errmsg']:`
			`code_scope += 1`
			`# whether action result is correct`
			`reasoning_scope += self.soft_equal(pred, refer, s)`

			`result = dict(`
			`follow_acc=100 * (row_reasoning_scope + final_scope) / total,`
			`reasoning_acc=100 *`
			`(reasoning_scope + final_scope + row_reasoning_scope) / total,`
			`code_acc=100 * (code_scope + final_scope) / total,`
[Sync] minor test (#683) 2023-12-11 17:42:53 +08:00			`action_pct=100 * (action_scope + final_scope) / total,`
[Feat] Support cibench (#538) * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * [Feat] support cidataset * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * rename cibench * rename cibench * rename cibench * rename cibench * minor fix * minor fix * minor fix 2023-11-07 19:11:44 +08:00			`)`
			`return result`