[Dataset] Add human_eval/mbpp pro (#2092)

* add bench

* update

* bug fix

* time update

* add index

* fix repeat bug
Dongsheng Zhu 2025-05-12 18:38:13 +08:00 committed by GitHub
parent 345674f700
commit 2c79dc5227
18 changed files with 593 additions and 106 deletions

View File

@ -611,6 +611,12 @@
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
configpath_llmjudge: ''
- humaneval_pro:
name: HumanEval Pro
category: Code
paper: https://arxiv.org/abs/2412.21199
configpath: opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
configpath_llmjudge: ''
- hungarian_math:
name: Hungarian_Math
category: Math
@ -695,6 +701,12 @@
paper: ''
configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
configpath_llmjudge: ''
- mbpp_pro:
name: MBPP Pro
category: Code
paper: https://arxiv.org/abs/2412.21199
configpath: opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
configpath_llmjudge: ''
- mgsm:
name: MGSM
category: Language / Math

View File

@ -0,0 +1,17 @@
# HumanEval Pro

## OpenCompass results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     67 |
| deepseek-v2-lite-chat-hf     |     35 |

## CodeEval-Pro results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     65 |
| deepseek-v2-lite-chat-hf     |     28 |
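For reference, a minimal run config wiring this dataset to a model might look like the sketch below. This is illustrative, not part of the benchmark itself: the model import path is hypothetical, so substitute any model config you actually use.

```python
# Minimal sketch of an OpenCompass eval config for HumanEval Pro.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import \
        humanevalpro_datasets
    # Hypothetical model config path; replace with your own.
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import \
        models as qwen2_5_models

datasets = humanevalpro_datasets
models = qwen2_5_models
```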

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .humaneval_pro_gen_3dc067 import humanevalpro_datasets # noqa: F401, F403

View File

@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a single Python file that solves both of the following problems; the solution to the second problem must make one or more calls to the solution of the first.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
humanevalpro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
humanevalpro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humanevalpro_eval_cfg = dict(
evaluator=dict(type=HumanevalProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space')
)
humanevalpro_datasets = [
dict(
abbr='humaneval_pro',
type=HumanevalevalProDataset,
path='opencompass/humaneval_pro',
reader_cfg=humanevalpro_reader_cfg,
infer_cfg=humanevalpro_infer_cfg,
eval_cfg=humanevalpro_eval_cfg,)
]

View File

@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a single Python file that solves both of the following problems; the solution to the second problem must make one or more calls to the solution of the first.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
humanevalpro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
humanevalpro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
humanevalpro_eval_cfg = dict(
evaluator=dict(type=HumanevalProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space')
)
humanevalpro_datasets = [
dict(
abbr='humaneval_pro',
type=HumanevalevalProDataset,
path='opencompass/humaneval_pro',
reader_cfg=humanevalpro_reader_cfg,
infer_cfg=humanevalpro_infer_cfg,
eval_cfg=humanevalpro_eval_cfg,
n=5,
k=3)
]
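# --- Editorial sketch (not used by this config) ----------------------------
# The n=5, k=3 fields above request five generations per problem so that a
# pass@3 score can be reported. For reference, the standard unbiased pass@k
# estimator (Chen et al., 2021) is sketched below; this is the textbook
# formula, not necessarily the exact code path OpenCompass uses internally.
from math import comb


def _pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: 1 - C(n-c, k) / C(n, k) for c correct of n samples."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# Example: with n=5 samples and c=2 correct, pass@3 = 1 - C(3,3)/C(5,3) = 0.9.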

View File

@ -0,0 +1,17 @@
# MBPP Pro

## OpenCompass results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     66 |
| qwen2.5-14b-instruct-hf      |     64 |
| deepseek-v2-lite-chat-hf     |     36 |

## CodeEval-Pro results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     65 |
| deepseek-v2-lite-chat-hf     |     39 |

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mbpp_pro_gen_3dc067 import mbpppro_datasets # noqa: F401, F403

View File

@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a single Python file that solves both of the following problems; the solution to the second problem must make one or more calls to the solution of the first.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
mbpppro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
mbpppro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
mbpppro_eval_cfg = dict(
evaluator=dict(type=MBPPProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space'),
)
mbpppro_datasets = [
dict(
abbr='mbpp_pro',
type=MBPPProDataset,
path='opencompass/mbpp_pro',
reader_cfg=mbpppro_reader_cfg,
infer_cfg=mbpppro_infer_cfg,
eval_cfg=mbpppro_eval_cfg)
]

View File

@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a single Python file that solves both of the following problems; the solution to the second problem must make one or more calls to the solution of the first.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
mbpppro_reader_cfg = dict(
input_columns=['raw_problem', 'new_problem'], output_column='test_code')
mbpppro_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=PROMPT_WRAPPER),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
mbpppro_eval_cfg = dict(
evaluator=dict(type=MBPPProEvaluator,
ip_address='https://opencompass-multiple-evaluator.hf.space'),
)
mbpppro_datasets = [
dict(
abbr='mbpp_pro',
type=MBPPProDataset,
path='opencompass/mbpp_pro',
reader_cfg=mbpppro_reader_cfg,
infer_cfg=mbpppro_infer_cfg,
eval_cfg=mbpppro_eval_cfg,
n=5,
k=3)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .multiple_top_ten_gen_f44aaf import multiple_datasets # noqa: F401, F403

View File

@ -32,7 +32,6 @@ multiple_datasets = [
type=MultiplEDataset,
abbr=f'humaneval-multiple-{lang}',
language=lang,
num_repeats=1,
path='opencompass/multipl_e',
tag='humaneval',
reader_cfg=multiple_reader_cfg,
@ -46,7 +45,6 @@ multiple_datasets += [
type=MultiplEDataset,
abbr=f'mbpp-multiple-{lang}',
language=lang,
num_repeats=1,
path='opencompass/multipl_e',
tag='mbpp',
reader_cfg=multiple_reader_cfg,

View File

@ -0,0 +1,58 @@
# Select the most popular programming languages from MultiPL-E to compose the test set.
# (Only cpp is enabled below for now; see the note at the end of this file.)
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MultiplEDataset, MultiplEEvaluator
_TOP_TEN_LANGUAGE_ = ['cpp']
multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests')
multiple_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
multiple_eval_cfg = {
lang: dict(
evaluator=dict(
type=MultiplEEvaluator,
language=lang,
ip_address='https://opencompass-multiple-evaluator.hf.space',
),
pred_role='BOT',
) for lang in _TOP_TEN_LANGUAGE_
}
multiple_datasets = [
dict(
type=MultiplEDataset,
abbr=f'humaneval-multiple-{lang}',
language=lang,
path='opencompass/multipl_e',
tag='humaneval',
reader_cfg=multiple_reader_cfg,
infer_cfg=multiple_infer_cfg,
eval_cfg=multiple_eval_cfg[lang],
n=5,
k=3
) for lang in _TOP_TEN_LANGUAGE_
]
multiple_datasets += [
dict(
type=MultiplEDataset,
abbr=f'mbpp-multiple-{lang}',
language=lang,
path='opencompass/multipl_e',
tag='mbpp',
reader_cfg=multiple_reader_cfg,
infer_cfg=multiple_infer_cfg,
eval_cfg=multiple_eval_cfg[lang],
n=5,
k=3
) for lang in _TOP_TEN_LANGUAGE_
]
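# Editorial note: despite the comment at the top of this file, only 'cpp' is
# enabled above. Extending coverage just means adding MultiPL-E language
# codes; an illustrative (unverified) ten-language list might be:
#
#     _TOP_TEN_LANGUAGE_ = [
#         'cpp', 'cs', 'go', 'java', 'js', 'php', 'rb', 'rs', 'sh', 'ts',
#     ]
#
# Verify each code exists in your local MultiPL-E data before enabling it.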

View File

@ -64,6 +64,7 @@ from .hle import * # noqa: F401, F403
from .huggingface import * # noqa: F401, F403
from .humaneval import * # noqa: F401, F403
from .humaneval_multi import * # noqa: F401, F403
from .humaneval_pro import * # noqa: F401, F403
from .humanevalx import * # noqa: F401, F403
from .hungarian_math import * # noqa: F401, F403
from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403
@ -96,6 +97,7 @@ from .math401 import * # noqa: F401, F403
from .math_intern import * # noqa: F401, F403
from .mathbench import * # noqa: F401, F403
from .mbpp import * # noqa: F401, F403
from .mbpp_pro import * # noqa: F401, F403
from .medbench import * # noqa: F401, F403
from .MedCalc_Bench import MedCalc_BenchDataset # noqa: F401
from .MedCalc_Bench import MedCalcOfficial_Evaluator # noqa: F401

View File

@ -0,0 +1,81 @@
# flake8: noqa: E501
import json
from typing import Dict, List
from datasets import Dataset
from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a single Python file that solves both of the following problems; the solution to the second problem must make one or more calls to the solution of the first.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
class HumanevalevalProDataset(BaseDataset):
@staticmethod
def load(path, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
dataset = []
with open(path, encoding='utf-8') as f:
raw_data = json.load(f)
for data in raw_data:
dataset.append(data)
return Dataset.from_list(dataset)
class HumanevalProEvaluator(CodeEvaluator):
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
# 1. Prepare data for all test cases
all_test_cases, prompts = [], []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completion = predictions[i]
# Process code completions
processed_completion = self._process_completions(completion)
code = processed_completion + '\n' + test_case['test_code']
sub_data_dict = {
'name': int(test_case['id']),
'language': self.language,
'code': code,
}
all_test_cases.append(sub_data_dict)
prompt = PROMPT_WRAPPER.format(
raw_problem=test_case['raw_problem'],
new_problem=test_case['new_problem'])
prompts.append(prompt)
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
if not success:
return {'error': error_message}
# 3. Process the returned results
return self._process_results(outputs, prompts, len(test_set_origin))
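# Editorial sketch: each item that score() sends to the evaluation service is
# the extracted completion concatenated with the dataset's test_code. A
# hypothetical payload entry (values illustrative, not from the real dataset)
# looks like:
#
#     {
#         'name': 0,            # int(test_case['id'])
#         'language': 'py',     # CodeEvaluator's default language
#         'code': 'def add(a, b):\n'
#                 '    return a + b\n'
#                 'def add_three(a, b, c):\n'
#                 '    return add(add(a, b), c)\n'
#                 'assert add_three(1, 2, 3) == 6\n',  # ends with test_code
#     }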

View File

@ -0,0 +1,81 @@
# flake8: noqa: E501
import json
from typing import Dict, List
from datasets import Dataset
from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path
from .base import BaseDataset
PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a single Python file that solves both of the following problems; the solution to the second problem must make one or more calls to the solution of the first.
```python
{raw_problem}
{new_problem}
```
Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""
class MBPPProDataset(BaseDataset):
@staticmethod
def load(path, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
dataset = []
with open(path, encoding='utf-8') as f:
for line in f:
dataset.append(json.loads(line.strip()))
return Dataset.from_list(dataset)
class MBPPProEvaluator(CodeEvaluator):
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
if len(predictions) != len(references):
return {
'error':
'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'
}
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
# 1. Prepare data for all test cases
all_test_cases, prompts = [], []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completion = predictions[i]
# Process code completions
processed_completion = self._process_completions(completion)
code = processed_completion + '\n' + test_case['test_code']
sub_data_dict = {
'name': int(test_case['id']),
'language': self.language,
'code': code,
}
all_test_cases.append(sub_data_dict)
prompt = PROMPT_WRAPPER.format(
raw_problem=test_case['raw_problem'],
new_problem=test_case['new_problem'])
prompts.append(prompt)
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
if not success:
return {'error': error_message}
# 3. Process the returned results
return self._process_results(outputs, prompts, len(test_set_origin))
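# Editorial sketch: MBPPProDataset.load reads one JSON object per line, so a
# local copy can be loaded directly. The path below matches the
# DATASETS_MAPPING entry added later in this commit (the file is JSONL
# despite its .json extension); the column names are an assumption.
#
#     ds = MBPPProDataset.load('./data/mbpp_pro/mbpp_pro.json',
#                              local_mode=True)
#     print(ds.column_names)
#     # expected, e.g.: ['id', 'raw_problem', 'new_problem', 'test_code']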

View File

@ -1,3 +1,4 @@
import difflib
import json
import os.path as osp
@ -28,7 +29,6 @@ class MultiplEDataset(BaseDataset):
@staticmethod
def load(path: str,
language: str,
num_repeats: int = 1,
tag: str = 'humaneval',
local_mode: bool = False):
"""Load dataset for pass k mode.
@ -56,8 +56,7 @@ class MultiplEDataset(BaseDataset):
dataset = []
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
dataset.extend(
[json.loads(line.strip()) for _ in range(num_repeats)])
dataset.append(json.loads(line.strip()))
return Dataset.from_list(dataset)
@ -84,20 +83,56 @@ class MultiplEEvaluator(CodeEvaluator):
min_stop_index = stop_index
return decoded_string[:min_stop_index]
def _process_completions(self, test_case, completions):
def _remove_prefix(self,
prompt: str,
completion: str,
threshold: float = 0.95) -> str:
"""Determine the truncation point in the completion based on the last
line of the prompt, remove all content before that line in the
completion, and return the completion string after removing the prefix.
This is done to convert chatbot-style inference mode to completion
mode.
Args:
prompt (str): The prompt text.
completion (str): The completion text.
threshold (float): Line similarity threshold.
Returns:
str: The completion string after removing the prefix.
"""
prompt_lines = prompt.splitlines()
completion_lines = completion.splitlines()
if not prompt_lines:
return completion
last_prompt_line = prompt_lines[-1]
cut_index = -1
for i, completion_line in enumerate(completion_lines):
similarity = difflib.SequenceMatcher(None, last_prompt_line,
completion_line).ratio()
if similarity >= threshold:
cut_index = i
break
if cut_index != -1:
return '\n'.join(completion_lines[cut_index + 1:])
else:
return completion
def _process_completions(self, test_case, completion):
"""Process completions with a test case.
Args:
test_case: A test case.
completions: A list of completions.
test_case (dict): A test case containing prompt and stop tokens.
completion (str): The generated code completion.
Returns:
A list of processed completions.
str: Processed code completion.
"""
processed_completions = []
for comp in completions:
comp = self._extract_code(comp)
post_comp = self._remove_prefix(test_case['prompt'], comp)
post_comp = self._stop_at_stop_token(post_comp,
test_case['stop_tokens'])
processed_completions.append(post_comp)
return processed_completions
post_comp = self._extract_code(completion)
post_comp = self._remove_prefix(test_case['prompt'], post_comp)
post_comp = self._stop_at_stop_token(post_comp,
test_case['stop_tokens'])
return post_comp
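    # Editorial worked example of the intended _remove_prefix behavior: chat
    # models often echo the prompt before continuing, so everything up to
    # and including the echoed last prompt line is dropped (values are
    # illustrative):
    #
    #     prompt = 'def fib(n):\n    """Return the n-th Fibonacci number."""'
    #     completion = ('def fib(n):\n'
    #                   '    """Return the n-th Fibonacci number."""\n'
    #                   '    if n < 2:\n'
    #                   '        return n\n'
    #                   '    return fib(n - 1) + fib(n - 2)')
    #     # _remove_prefix(prompt, completion) returns only the three body
    #     # lines after the echoed docstring line.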

View File

@ -1,12 +1,12 @@
# flake8: noqa: E501
import difflib
import os
import re
import tempfile
import time
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
from datasets import Dataset
from gradio_client import Client
@ -24,9 +24,9 @@ class CodeEvaluator(BaseEvaluator):
"""
def __init__(self,
language: str,
language: str = 'py',
ip_address: str = 'localhost',
retry: int = 3) -> None:
retry: int = 5) -> None:
"""Initialize the CodeEvaluator.
Args:
@ -71,6 +71,7 @@ class CodeEvaluator(BaseEvaluator):
- output (dict/list/str): Evaluation results or error message
"""
try:
import requests
temp_file_path = None
# Handle file path input
if isinstance(input_data, str):
@ -83,7 +84,15 @@ class CodeEvaluator(BaseEvaluator):
input_data = temp_file_path
# Send to evaluation service
result = self.client.predict(input_data, api_name='/evaluate')
try:
result = self.client.predict(input_data, api_name='/evaluate')
except Exception as e:
# Catch timeout and other exceptions
if 'timed out' in str(e).lower() or 'timeout' in str(
e).lower():
return False, f'Request to code eval service timed out: {e}'
else:
raise
# Process the result
if isinstance(result, (dict, list)):
@ -107,63 +116,16 @@ class CodeEvaluator(BaseEvaluator):
except: # noqa: E722
pass
def _remove_prefix(self,
prompt: str,
completion: str,
threshold: float = 0.95) -> str:
"""Determine the truncation point in the completion based on the last
line of the prompt, remove all content before that line in the
completion, and return the completion string after removing the prefix.
This is done to convert chatbot-style inference mode to completion
mode.
    def _process_completions(self, completion: str) -> str:
"""Process code completions to extract the relevant code.
Args:
prompt (str): The prompt text.
completion (str): The completion text.
threshold (float): Line similarity threshold.
completion (str): Code completion string.
Returns:
str: The completion string after removing the prefix.
            str: Processed code completion.
"""
prompt_lines = prompt.splitlines()
completion_lines = completion.splitlines()
if not prompt_lines:
return completion
last_prompt_line = prompt_lines[-1]
cut_index = -1
for i, completion_line in enumerate(completion_lines):
similarity = difflib.SequenceMatcher(None, last_prompt_line,
completion_line).ratio()
if similarity >= threshold:
cut_index = i
break
if cut_index != -1:
return '\n'.join(completion_lines[cut_index + 1:])
else:
return completion
def _process_completions(self, test_case: dict, completions: list) -> list:
"""Process code completion list, which typically involves extracting
code, removing repetitive prefixes caused by chatbot mode, and other
steps to ensure the model-generated code can be compiled successfully.
Args:
test_case (dict): Dictionary containing test case information including:
completions (list): List of code completions generated by the model.
Returns:
list: Processed code completion list.
"""
processed_completions = []
for comp in completions:
comp = self._extract_code(comp)
post_comp = self._remove_prefix(test_case['prompt'], comp)
processed_completions.append(post_comp)
return processed_completions
post_comp = self._extract_code(completion)
return post_comp
def _evaluate(
self, input_data: Union[Dict, List]
@ -186,7 +148,7 @@ class CodeEvaluator(BaseEvaluator):
succeed, output = self._code_eval_service(input_data)
if not succeed:
num_retry += 1
time.sleep(10)
time.sleep(30)
else:
break
@ -195,6 +157,31 @@ class CodeEvaluator(BaseEvaluator):
return True, output, None
def _process_results(self, outputs: List, prompts: List,
total_count: int) -> Dict:
"""Process the evaluation results.
Args:
outputs (list): List of evaluation results for each test case.
prompts (list): List of prompts used for each test case.
total_count (int): Total number of test cases.
Returns:
dict: Processed results including:
- pass@1: Percentage of test cases passed
- details: Detailed results for each test case
"""
details = []
correct = 0
for output, prompt in zip(outputs, prompts):
output['prompt'] = prompt
if output.get('status') == 'OK':
output['correct'] = True
correct += 1
else:
output['correct'] = False
details.append(output)
        return {'pass@1': 100 * correct / total_count, 'details': details}
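    # Editorial example: given hypothetical service outputs
    #     outputs = [{'status': 'OK'}, {'status': 'CompileError'}]
    #     prompts = ['task 0 prompt', 'task 1 prompt']
    # _process_results(outputs, prompts, total_count=2) returns
    #     {'pass@1': 50.0, 'details': [...]}
    # where each detail entry gains its 'prompt' and a boolean 'correct'.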
def score(self, predictions: List, references: List,
test_set: Dataset) -> Dict:
"""Score code generation predictions against references.
@ -221,28 +208,25 @@ class CodeEvaluator(BaseEvaluator):
test_set = test_set.to_pandas()
# Use the first column as the unique identifier
test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
num_repeats = int(len(test_set) / len(test_set_origin))
# 1. Prepare data for all test cases
all_test_cases = []
all_test_cases, prompts = [], []
for i in range(len(test_set_origin)):
test_case = test_set_origin.iloc[i]
completions = predictions[i * num_repeats:(i + 1) * num_repeats]
completion = predictions[i]
# Process code completions
processed_completions = self._process_completions(
test_case, completions)
result_dict = {
processed_completion = self._process_completions(
test_case, completion)
code = test_case[
'prompt'] + processed_completion + '\n' + test_case['tests']
sub_data_dict = {
'name': test_case['name'],
'language': test_case['language'],
'prompt': test_case['prompt'],
'tests': test_case['tests'],
'processed_completions': processed_completions,
'completions': completions
'code': code
}
all_test_cases.append(result_dict)
all_test_cases.append(sub_data_dict)
prompts.append(test_case['prompt'])
# 2. Send all test cases to the evaluation service
success, outputs, error_message = self._evaluate(all_test_cases)
@ -250,18 +234,4 @@ class CodeEvaluator(BaseEvaluator):
return {'error': error_message}
# 3. Process the returned results
details = []
correct = 0
for output in outputs:
if output.get('status') == 'OK':
output['correct'] = True
correct += 1
else:
output['correct'] = False
details.append(output)
return {
f'pass@{num_repeats}': 100 * correct / len(test_set_origin),
'details': details
}
return self._process_results(outputs, prompts, len(test_set_origin))

View File

@ -451,7 +451,16 @@ DATASETS_MAPPING = {
"hf_id": "",
"local": "./data/nejmaibench/NEJM_All_Questions_And_Answers.csv",
},
"opencompass/humaneval_pro": {
"ms_id": "",
"hf_id": "",
"local": "./data/humaneval_pro/humaneval_pro.json",
},
"opencompass/mbpp_pro": {
"ms_id": "",
"hf_id": "",
"local": "./data/mbpp_pro/mbpp_pro.json",
},
}
DATASETS_URL = {
@ -808,6 +817,13 @@ DATASETS_URL = {
"url":
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nejmaibench.zip",
"md5": "e6082cae3596b3ebea73e23ba445b99e"
}
},
"humaneval_pro": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip",
"md5": "4c6fe556e84e905e4f0902d699e46de5",
},
"mbpp_pro": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip",
"md5": "eac330b8a0a8687f006265c9383503ce",
},
}
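# Editorial sketch: get_data_path prefers the DATASETS_MAPPING 'local' path,
# while DATASETS_URL supplies a zip URL plus an md5 for download-and-verify.
# OpenCompass's actual download logic lives elsewhere and may differ; a plain
# md5 check would look like:
#
#     import hashlib
#
#     def md5_matches(path: str, expected: str) -> bool:
#         h = hashlib.md5()
#         with open(path, 'rb') as f:
#             for chunk in iter(lambda: f.read(1 << 20), b''):
#                 h.update(chunk)
#         return h.hexdigest() == expected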