import json
import os.path as osp
import re
import tempfile
from typing import List

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class HumanevalDataset(BaseDataset):

    @staticmethod
    def load(path: str, num_repeats: int = 1):
        """Load humaneval dataset for pass k mode.

        Note that you can use num_repeats > 1 when your model does not support
        `num_return_sequences` in generation, otherwise use the raw humaneval
        dataset and set `num_return_sequences` in the model config to generate
        multiple responses for testing pass@k > 1.

        It is better to change your dataset abbr correspondingly if you set
        num_repeats > 1, otherwise the number in `.cache/dataset_size.json`
        might be inconsistent.

        Args:
            path (str): Path to the HumanEval jsonl file.
            num_repeats (int): Number of repetitions of each problem, used to
                get multiple responses in special cases.
        """
        dataset = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                dataset.extend(
                    [json.loads(line.strip()) for _ in range(num_repeats)])
        return Dataset.from_list(dataset)
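
# Illustrative usage (a sketch, not executed by this module; the path is a
# placeholder for a local HumanEval jsonl file):
#
#   ds = HumanevalDataset.load('path/to/HumanEval.jsonl', num_repeats=10)
#
# Each problem then appears 10 times, so one sampled completion per row yields
# 10 candidates per task_id, enough to report pass@10 even when the backend
# does not support `num_return_sequences`.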


class HumanEvaluator(BaseEvaluator):
    """Evaluator for HumanEval."""

    def __init__(self, k: List[int] = [1, 10, 100]) -> None:
        try:
            from human_eval.data import HUMAN_EVAL, write_jsonl
            from human_eval.evaluation import evaluate_functional_correctness
            self.write_jsonl = write_jsonl
            self.HUMAN_EVAL = HUMAN_EVAL
            self.eval = evaluate_functional_correctness
        except ImportError:
            raise ImportError('Please install human_eval following '
                              'https://github.com/openai/human-eval/tree/'
                              'master#installation first.')
        self.k = k
        super().__init__()

    def score(self, predictions, references):
        humaneval_preds = []
        # create a jsonl file in human_eval's format
        for preds, refer in zip(predictions, references):
            # handles two cases:
            # 1. repeated dataset entries (num_repeats > 1)
            # 2. `num_return_sequences` used to generate multiple responses
            if not isinstance(preds, list):
                preds = [preds]
            for pred in preds:
                humaneval_preds.append({'task_id': refer, 'completion': pred})
        with tempfile.TemporaryDirectory() as tmp_dir:
            out_dir = osp.join(tmp_dir, 'human_eval.json')
            self.write_jsonl(out_dir, humaneval_preds)
            score = self.eval(out_dir,
                              self.k,
                              n_workers=4,
                              timeout=3.0,
                              problem_file=self.HUMAN_EVAL)
            return {f'humaneval_{k}': score[k] * 100 for k in score}
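
# Illustrative call (a sketch; the metric values are made up). `references`
# are HumanEval task ids and each prediction may be a string or a list of
# strings:
#
#   evaluator = HumanEvaluator(k=[1, 10])
#   evaluator.score(predictions=completions, references=task_ids)
#   # -> e.g. {'humaneval_pass@1': 32.9, 'humaneval_pass@10': 57.3}
#
# The keys follow human_eval's `pass@k` naming; values are scaled to percent.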


def humaneval_postprocess(text: str) -> str:
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        if len(blocks) == 0:
            text = text.split('```')[1]  # fall back to default strategy
        else:
            text = blocks[0]  # fetch the first code block
            if not text.startswith('\n'):  # in case starting with ```python
                text = text[max(text.find('\n') + 1, 0):]
    if text.strip().startswith('from') or text.strip().startswith('import'):
        def_idx = text.find('def')
        if def_idx != -1:
            text = text[max(text.find('\n', def_idx) + 1, 0):]
    text = text.split('\n\n')[0]
    text = text.lstrip('\n')
    if text.strip().startswith('def'):
        text = '\n'.join(text.split('\n')[1:])
    if not text.startswith('    '):
        if text.startswith(' '):
            text = '    ' + text.lstrip()
        else:
            text = '\n'.join(['    ' + line for line in text.split('\n')])
    return text
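
# Worked example (a sketch): given a markdown-fenced completion such as
#
#   "Here is the solution:\n```python\ndef add(a, b):\n    return a + b\n```"
#
# the steps above strip the fence and the `def` line, leaving only the
# indented body "    return a + b", ready to be appended to the HumanEval
# prompt that already contains the function signature.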


def humaneval_postprocess_v2(text: str) -> str:
    """An advanced version of the previous postprocessor that handles more
    situations; prefer this one."""
    text = text.lstrip('\n')
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        if len(blocks) == 0:
            text = text.split('```')[1]  # fall back to default strategy
        else:
            text = blocks[0]  # fetch the first code block
            if not text.startswith('\n'):  # in case starting with ```python
                text = text[max(text.find('\n') + 1, 0):]
    if text.strip().startswith('from') or text.strip().startswith('import'):
        def_idx = text.find('def')
        if def_idx != -1:
            text = text[max(text.find('\n', def_idx) + 1, 0):]
    # remove empty lines
    text = '\n'.join([line for line in text.split('\n') if line != ''])
    text = text.lstrip('\n')
    if text.strip().startswith('def'):
        text = '\n'.join(text.split('\n')[1:])
    if not text.startswith('    '):
        if text.startswith(' '):
            text = '    ' + text.lstrip()
        else:
            text = '\n'.join(['    ' + line for line in text.split('\n')])
    text = text.split('\n')

    # If the number of leading spaces drops, we assume the code block ends.
    min_leading_space = None
    end_index = None
    for index, line in enumerate(text):
        if line.strip() == '' or line.strip()[0] in ["'", '"', '#']:
            continue
        current_leading_space = len(line.rstrip()) - len(line.strip())
        if min_leading_space is None:
            min_leading_space = current_leading_space
        elif current_leading_space < min_leading_space:
            end_index = index
            break
    if end_index is not None:
        text = '\n'.join(text[:end_index])
    else:
        text = '\n'.join(text)
    return text
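
# Worked example (a sketch): unlike `humaneval_postprocess`, the indentation
# scan above also trims trailing top-level text. If the cleaned completion is
#
#       return a + b
#   print(add(1, 2))
#
# the `print` line has less leading whitespace than the function body, so it
# and everything after it are dropped; only the indented body is kept.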


def humaneval_gpt_postprocess(text: str) -> str:
    """Answer postprocessor for better instruction-aligned models such as
    GPT."""
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        if len(blocks) == 0:
            text = text.split('```')[1]  # fall back to default strategy
        else:
            text = blocks[0]  # fetch the first code block
            if not text.startswith('\n'):  # in case starting with ```python
                text = text[max(text.find('\n') + 1, 0):]
    if text.strip().startswith('from') or text.strip().startswith('import'):
        def_idx = text.find('def')
        if def_idx != -1:
            text = text[max(text.find('\n', def_idx) + 1, 0):]
    text = text.split('\n\n\n')[0]
    if text.strip().startswith('def'):
        text = '\n'.join(text.split('\n')[1:])
    if not text.startswith('    '):
        if text.startswith(' '):
            text = '    ' + text.lstrip()
        else:
            text = '\n'.join(['    ' + line for line in text.split('\n')])
    return text
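
# Typical wiring in an OpenCompass dataset config (a sketch following the
# usual config conventions; the variable name is illustrative):
#
#   humaneval_eval_cfg = dict(
#       evaluator=dict(type=HumanEvaluator),
#       pred_postprocessor=dict(type=humaneval_postprocess_v2),
#   )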