[Enhancement] Add a humaneval postprocessor for GPT models and an eval config for GPT-4; enhance the original humaneval postprocessor (#129)

* [Enhancement] Enhance the humaneval postprocessor

* Add human-eval test cases

---------

Co-authored-by: Leymore <zfz-960727@163.com>
Commit 2931f3dcb8 by Tong Gao, 2023-08-10 16:31:12 +08:00 (committed via GitHub)
Parent: 3f36db3b06
4 changed files with 191 additions and 7 deletions

configs/datasets/humaneval/humaneval_gen_*.py
@@ -1,7 +1,7 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import HFDataset, HumanEvaluator
+from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess

 humaneval_reader_cfg = dict(
     input_columns=['prompt'], output_column='task_id', train_split='test')

@@ -17,7 +17,7 @@ humaneval_infer_cfg = dict(
 humaneval_eval_cfg = dict(
     evaluator=dict(type=HumanEvaluator),
     k=[1, 10, 100],  # the parameter only for humaneval
-    pred_postprocessor=dict(type='humaneval'),
+    pred_postprocessor=dict(type=humaneval_postprocess),
 )

 humaneval_datasets = [
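Since the eval config now stores the postprocessor as a callable rather than the 'humaneval' registry string, whatever consumes the config can invoke it directly. A minimal sketch of that usage, assuming only the import shown in the diff (apply_postprocessor is a hypothetical helper, not an OpenCompass API):

from opencompass.datasets import humaneval_postprocess

# The eval config entry now holds the function object itself:
pred_postprocessor = dict(type=humaneval_postprocess)

# Hypothetical consumer: look up the configured callable and apply it
# to a raw model prediction.
def apply_postprocessor(cfg: dict, prediction: str) -> str:
    return cfg['type'](prediction)

cleaned = apply_postprocessor(pred_postprocessor,
                              '```python\n    return x - int(x)\n```')
print(repr(cleaned))  # '    return x - int(x)\n'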

configs/eval_gpt4.py (new file, 40 lines)

from mmengine.config import read_base
from opencompass.models import OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from .datasets.collections.chat_medium import datasets
    from .summarizers.medium import summarizer

# GPT-4 needs a special humaneval postprocessor
from opencompass.datasets.humaneval import humaneval_gpt_postprocess

for _dataset in datasets:
    if _dataset['path'] == 'openai_humaneval':
        _dataset['eval_cfg']['pred_postprocessor']['type'] = humaneval_gpt_postprocess

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

models = [
    dict(
        abbr='GPT4',
        type=OpenAI,
        path='gpt-4-0613',
        key='ENV',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
        meta_template=api_meta_template,
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=4,
        task=dict(type=OpenICLInferTask)),
)
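Assuming the standard OpenCompass entry point, this config would then be launched with "python run.py configs/eval_gpt4.py", with the key exported as OPENAI_API_KEY beforehand (matching key='ENV' above).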

opencompass/datasets/humaneval.py

@@ -1,12 +1,11 @@
 import os.path as osp
+import re
 import tempfile
 from typing import List

 from opencompass.openicl.icl_evaluator import BaseEvaluator
-from opencompass.registry import ICL_EVALUATORS, TEXT_POSTPROCESSORS


-@ICL_EVALUATORS.register_module()
 class HumanEvaluator(BaseEvaluator):
     """Evaluator for human eval."""

@@ -41,11 +40,46 @@ class HumanEvaluator(BaseEvaluator):
         return {f'humaneval_{k}': score[k] * 100 for k in score}


-@TEXT_POSTPROCESSORS.register_module('humaneval')
 def humaneval_postprocess(text: str) -> str:
-    text = text.split('\n\n')[0]
     if '```' in text:
-        text = text.split('```')[1]
+        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+        if len(blocks) == 0:
+            text = text.split('```')[1]  # fall back to default strategy
+        else:
+            text = blocks[0]  # fetch the first code block
+            if not text.startswith('\n'):  # in case starting with ```python
+                text = text[max(text.find('\n') + 1, 0):]
+    if text.strip().startswith('from') or text.strip().startswith('import'):
+        def_idx = text.find('def')
+        if def_idx != -1:
+            text = text[max(text.find('\n', def_idx) + 1, 0):]
+    text = text.split('\n\n')[0]
+    if text.strip().startswith('def'):
+        text = '\n'.join(text.split('\n')[1:])
+    if not text.startswith('    '):
+        if text.startswith(' '):
+            text = '    ' + text.lstrip()
+        else:
+            text = '\n'.join(['    ' + line for line in text.split('\n')])
+    return text
+
+
+def humaneval_gpt_postprocess(text: str) -> str:
+    """Better answer postprocessor for better instruction-aligned models like
+    GPT."""
+    if '```' in text:
+        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+        if len(blocks) == 0:
+            text = text.split('```')[1]  # fall back to default strategy
+        else:
+            text = blocks[0]  # fetch the first code block
+            if not text.startswith('\n'):  # in case starting with ```python
+                text = text[max(text.find('\n') + 1, 0):]
+    if text.strip().startswith('from') or text.strip().startswith('import'):
+        def_idx = text.find('def')
+        if def_idx != -1:
+            text = text[max(text.find('\n', def_idx) + 1, 0):]
+    text = text.split('\n\n\n')[0]
     if text.strip().startswith('def'):
         text = '\n'.join(text.split('\n')[1:])
     if not text.startswith('    '):
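As a quick sanity check (not part of the commit), the practical difference between the two postprocessors is where they truncate: the base version cuts at the first blank line, while the GPT variant cuts only at a double blank line, since instruction-aligned models often leave single blank lines inside a function body. A minimal sketch:

from opencompass.datasets.humaneval import (humaneval_postprocess,
                                            humaneval_gpt_postprocess)

completion = '    total = 0\n\n    return total'

# Base postprocessor truncates at the first blank line:
print(repr(humaneval_postprocess(completion)))
# -> '    total = 0'

# GPT variant keeps single blank lines, truncating only at '\n\n\n':
print(repr(humaneval_gpt_postprocess(completion)))
# -> '    total = 0\n\n    return total'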

Unit tests for humaneval_postprocess (new file, 110 lines)
import unittest

from opencompass.datasets.humaneval import humaneval_postprocess


def run_humaneval_check(completion):
    program = [
        'def get_fraction(x: float) -> float:',
        humaneval_postprocess(completion),
        '',
        'assert get_fraction(1.28) == 0.28',
        'assert get_fraction(1.0) == 0.0',
    ]
    program = '\n'.join(program)
    exec(program)


class TestHumaneval(unittest.TestCase):

    def test_vanilla(self):
        raw = '    return x - int(x)'
        run_humaneval_check(raw)

    def test_python_quote(self):
        lines = [
            '```python',
            '    return x - int(x)',
            '```',
        ]
        raw = '\n'.join(lines)
        run_humaneval_check(raw)

    def test_bare_quote(self):
        lines = [
            '```',
            '    return x - int(x)',
            '```',
        ]
        raw = '\n'.join(lines)
        run_humaneval_check(raw)

    def test_error_space_quote(self):
        lines = [
            '```',
            '  return x - int(x)',
            '```',
        ]
        raw = '\n'.join(lines)
        run_humaneval_check(raw)

    def test_import_1(self):
        lines = [
            'import numpy as np',
            'import math',
            'from typing import List',
            '',
            'def func(x):',
            '    return x - int(x)',
        ]
        raw = '\n'.join(lines)
        run_humaneval_check(raw)

    def test_import_2(self):
        lines = [
            'from typing import List',
            'import numpy as np',
            'import math',
            'def func(x):',
            '    return x - int(x)',
        ]
        raw = '\n'.join(lines)
        run_humaneval_check(raw)

    def test_import_3(self):
        lines = [
            'import math',
            '',
            '',
            'def func(x):',
            '    return x - int(x)',
        ]
        raw = '\n'.join(lines)
        run_humaneval_check(raw)

    def test_comment(self):
        lines = [
            'def func(x: float) -> float:',
            "    '''",
            '    blah blah blah',
            '    blah blah blah',
            "    '''",
            '    return x - int(x)',
        ]
        raw = '\n'.join(lines)
        run_humaneval_check(raw)

    def test_additional(self):
        lines = [
            '    return x - int(x)',
            '',
            '',
            'def func(x: float) -> float:',
            "    '''",
            '    blah blah blah',
            '    blah blah blah',
            "    '''",
            '    return x - int(x)',
        ]
        raw = '\n'.join(lines)
        run_humaneval_check(raw)
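The suite only exercises humaneval_postprocess; a companion test for the GPT variant could follow the same pattern. A sketch under that assumption (not part of the commit; run_humaneval_gpt_check is a hypothetical mirror of run_humaneval_check above):

import unittest

from opencompass.datasets.humaneval import humaneval_gpt_postprocess


def run_humaneval_gpt_check(completion):
    # Same harness as run_humaneval_check, but through the GPT postprocessor.
    program = '\n'.join([
        'def get_fraction(x: float) -> float:',
        humaneval_gpt_postprocess(completion),
        '',
        'assert get_fraction(1.28) == 0.28',
    ])
    exec(program)


class TestHumanevalGPT(unittest.TestCase):

    def test_blank_line_in_body(self):
        # A single blank line inside the body must survive the GPT variant,
        # which truncates only at a double blank line.
        raw = '    frac = x - int(x)\n\n    return frac'
        run_humaneval_gpt_check(raw)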