# flake8: noqa: E501
# yapf: disable
import copy
import json
import os.path as osp
import re
import tempfile
from typing import List

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET

from .base import BaseDataset

HUMANEVAL_IMPORT_ERROR = '''\
Please install human_eval using the following steps:
git clone git@github.com:open-compass/human-eval.git
cd human-eval && pip install -e .'''

HUMANEVAL_PLUS_IMPORT_ERROR = '''\
Please install evalplus using the following steps:
git clone --recurse-submodules git@github.com:open-compass/human-eval.git
cd human-eval
pip install -e .
pip install -e evalplus'''


@LOAD_DATASET.register_module()
class HumanevalDataset(BaseDataset):

    @staticmethod
    def load(path: str, num_repeats: int = 1):
        """Load the HumanEval dataset for pass@k evaluation.

        Note that you can use num_repeats > 1 when your model does not
        support `num_return_sequence` in generation; otherwise use the raw
        HumanEval dataset and set `num_return_sequence` in the model config
        to generate multiple responses for testing pass@k > 1.

        It is better to change your dataset abbreviation accordingly if you
        set num_repeats > 1, otherwise the number recorded in
        `.cache/dataset_size.json` might be inconsistent.

        Args:
            path (str): Path to the HumanEval jsonl file.
            num_repeats (int): Number of repetitions of each problem, used
                to collect multiple responses in special cases.
        """
        dataset = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
                dataset.extend([copy.deepcopy(line) for _ in range(num_repeats)])
        return Dataset.from_list(dataset)
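
# A minimal usage sketch (illustrative only; the jsonl path below is an
# assumption, not something defined in this file). Repeating every problem
# four times lets a model that returns a single sample per prompt still be
# scored with pass@k for k > 1:
#
#     ds = HumanevalDataset.load('data/humaneval/human-eval-v2-20210705.jsonl',
#                                num_repeats=4)
#     # each task_id now appears four consecutive times in `ds`
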
class HumanEvalEvaluator(BaseEvaluator):
    """Evaluator for HumanEval."""

    def __init__(self, k: List[int] = [1, 10, 100]) -> None:
        try:
            # only verify that the human_eval package is importable here;
            # the evaluation entry points are imported lazily in `score`
            import human_eval  # noqa: F401
        except ImportError:
            raise ImportError(HUMANEVAL_IMPORT_ERROR)

        self.k = k
        super().__init__()

    def score(self, predictions, references, test_set):
        if len(predictions) != len(references):
            return {'error': 'predictions and references have different lengths'}

        from human_eval.data import HUMAN_EVAL, write_jsonl
        from human_eval.evaluation import evaluate_functional_correctness

        prompts = [item['prompt'] for item in test_set]
        humaneval_preds = []
        # write the predictions to a temporary file in the human_eval format
        for preds, refer in zip(predictions, references):
            # handles two cases:
            # 1. a repeated dataset (num_repeats > 1)
            # 2. `num_return_sequences` used to generate multiple responses
            if not isinstance(preds, list):
                preds = [preds]
            for pred in preds:
                humaneval_preds.append({'task_id': refer, 'completion': pred})
        with tempfile.TemporaryDirectory() as tmp_dir:
            out_dir = osp.join(tmp_dir, 'human_eval.json')
            write_jsonl(out_dir, humaneval_preds)
            score = evaluate_functional_correctness(out_dir, self.k, n_workers=4, timeout=3.0, problem_file=HUMAN_EVAL)

            detail_path = osp.join(tmp_dir, 'human_eval.json_results.jsonl')
            details = {}
            with open(detail_path, 'r') as f:
                for index, line in enumerate(f):
                    line = json.loads(line)
                    line['is_correct'] = line['passed']
                    line['prompt'] = prompts[index]
                    details[str(index)] = line

        results = {f'humaneval_{k}': score[k] * 100 for k in score}
        results['details'] = details
        return results
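
# Rough sketch of what `HumanEvalEvaluator.score` returns (the exact pass@k
# keys follow whatever `evaluate_functional_correctness` reports for the
# requested `k`; the numbers are made up):
#
#     {
#         'humaneval_pass@1': 65.9,   # percentage, i.e. raw score * 100
#         'details': {
#             '0': {'task_id': 'HumanEval/0', 'passed': True,
#                   'is_correct': True, 'prompt': '...'},
#             ...
#         },
#     }
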
class HumanEvalPlusEvaluator(BaseEvaluator):
    """Evaluator for EvalPlus (HumanEval+)."""

    def __init__(self, k: List[int] = [1, 10, 100]) -> None:
        try:
            # only verify that the evalplus package is importable here;
            # the evaluation entry points are imported lazily in `score`
            import evalplus  # noqa: F401
        except ImportError:
            raise ImportError(HUMANEVAL_PLUS_IMPORT_ERROR)

        self.k = k
        super().__init__()

    def score(self, predictions, references, test_set):
        if len(predictions) != len(references):
            return {'error': 'predictions and references have different lengths'}

        from evalplus.data import write_jsonl
        from evalplus.evaluate import evaluate

        prompts = [item['prompt'] for item in test_set]
        humaneval_preds = []
        for preds, refer, prompt in zip(predictions, references, prompts):
            # as above, each prediction may be a single string or a list of
            # sampled completions
            if not isinstance(preds, list):
                preds = [preds]
            for pred in preds:
                # the 'solution' field holds the full program (prompt + completion)
                humaneval_preds.append({'task_id': refer, 'solution': prompt + pred})
        with tempfile.TemporaryDirectory() as tmp_dir:
            out_dir = osp.join(tmp_dir, 'human_eval.jsonl')
            write_jsonl(out_dir, humaneval_preds)
            flags = dict(
                dataset='humaneval',
                samples=out_dir,
                base_only=None,
                parallel=None,
                i_just_wanna_run=None,
                test_details=0.2,
                min_time_limit=0.2,
                gt_time_limit_factor=4.0,
                mini=None,
            )
            score = evaluate(flags)
            results_path = osp.join(tmp_dir, 'human_eval_eval_results.json')
            with open(results_path, 'r') as f:
                results = json.load(f)
            details = {}
            for index in range(len(predictions)):
                r = results['eval'][references[index]]
                details[str(index)] = {
                    'prompt': prompts[index],
                    'prediction': predictions[index],
                    'reference': references[index],
                    'base_result': r['base'][0][0],
                    'plus_result': r['plus'][0][0],
                    'is_correct': r['base'][0][0] == 'success' and r['plus'][0][0] == 'success',
                }
                if r['nfiles'] > 1:
                    details[str(index)]['warning'] = 'Multiple files in the solution. Details may be wrong.'
            results = {f'humaneval_plus_{k}': score[k] * 100 for k in score}
            results['details'] = details
            return results
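
# Rough sketch of what `HumanEvalPlusEvaluator.score` returns (keys follow
# whatever pass@k scores `evaluate` reports here, prefixed with
# 'humaneval_plus_'; the numbers are made up):
#
#     {
#         'humaneval_plus_pass@1': 70.1,   # percentage, i.e. raw score * 100
#         'details': {
#             '0': {'base_result': 'success', 'plus_result': 'success',
#                   'is_correct': True, ...},
#             ...
#         },
#     }
#
# 'base_result' is the outcome on the original HumanEval tests, while
# 'plus_result' is the outcome on the extended EvalPlus test inputs.
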
def humaneval_postprocess_v2(text: str) -> str:
    """Extract the contents of a markdown-style fenced code block, if any."""
    blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
    if len(blocks) >= 1:
        # keep only the first fenced block; drop surrounding chat text
        text = blocks[0]
    return text
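
# Illustrative example (not part of the evaluation pipeline): given a chat
# style reply such as
#
#     Here is the solution:
#     ```python
#     def add(a, b):
#         return a + b
#     ```
#
# humaneval_postprocess_v2 keeps only the body of the first fenced block,
# i.e. "def add(a, b):\n    return a + b\n"; replies without any fenced
# block are returned unchanged.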