Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Merge b2c84058f2 into 6a6a1a5c0b (commit a3fa2fb105)
examples/eval_codebench.py (new file, 153 lines)
@@ -0,0 +1,153 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

with read_base():
    # Datasets Part
    # bigcodebench
    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
        bigcodebench_full_instruct_datasets
    )
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
        bigcodebench_hard_instruct_datasets
    )
    # livecodebench code generation lite v5
    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
        LCB_datasets
    )
    # humaneval
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
        humaneval_datasets
    )
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
        humanevalpro_datasets
    )
    from opencompass.configs.datasets.humanevalx.humanevalx_gen_627de5 import (
        humanevalx_datasets
    )
    # mbpp
    from opencompass.configs.datasets.mbpp.mbpp_gen import (
        mbpp_datasets
    )
    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
        mbpppro_datasets
    )
    # multipl-e
    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
        multiple_datasets
    )
    # ds1000
    from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
        ds1000_datasets
    )

    # Models Part
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
        models as lmdeploy_internlm3_8b_instruct_model,
    )

    # Summary Groups
    from opencompass.configs.summarizers.groups.ds1000 import (
        ds1000_summary_groups,
    )
    from opencompass.configs.summarizers.groups.multipl_e import (
        multiple_summary_groups,
    )
    from opencompass.configs.summarizers.groups.humanevalx import (
        humanevalx_summary_groups,
    )

# models config
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

for model in models:
    model['max_seq_len'] = 16384
    model['max_out_len'] = 8192

# datasets config
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

for item in humanevalx_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
    item['eval_cfg']['evaluator']['port'] = ''
for item in ds1000_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
    item['eval_cfg']['evaluator']['port'] = ''

for dataset in datasets:
    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192

# summary
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.append(
    {'name': 'humanevalx',
     'subsets': ['humanevalx-python', 'humanevalx-cpp',
                 'humanevalx-java', 'humanevalx-js']}
)
summarizer = dict(
    dataset_abbrs=[
        ['bigcodebench_hard_instruct', 'pass@1'],
        ['bigcodebench_full_instruct', 'pass@1'],
        ['lcb_code_generation', 'pass@1'],
        ['openai_humaneval', 'humaneval_pass@1'],
        ['mbpp', 'score'],
        ['humaneval_pro', 'pass@1'],
        ['mbpp_pro', 'pass@1'],
        ['multiple', 'naive_average'],
        ['humanevalx', 'naive_average'],
        ['ds1000', 'naive_average'],
        '',
        'humanevalx-python',
        'humanevalx-cpp',
        'humanevalx-java',
        'humanevalx-js',
        '',
        'ds1000_Pandas',
        'ds1000_Numpy',
        'ds1000_Tensorflow',
        'ds1000_Scipy',
        'ds1000_Sklearn',
        'ds1000_Pytorch',
        'ds1000_Matplotlib',
        '',
        'humaneval-multiple-cpp',
        'humaneval-multiple-cs',
        'humaneval-multiple-go',
        'humaneval-multiple-java',
        'humaneval-multiple-rb',
        'humaneval-multiple-js',
        'humaneval-multiple-php',
        'humaneval-multiple-r',
        'humaneval-multiple-rs',
        'humaneval-multiple-sh',
        '',
        'mbpp-multiple-cpp',
        'mbpp-multiple-cs',
        'mbpp-multiple-go',
        'mbpp-multiple-java',
        'mbpp-multiple-rb',
        'mbpp-multiple-js',
        'mbpp-multiple-php',
        'mbpp-multiple-r',
        'mbpp-multiple-rs',
        'mbpp-multiple-sh'
    ],
    summary_groups=summary_groups,
)

work_dir = 'outputs/code'
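Note: the locals() scans in this config are just a compact way to concatenate every imported list; for the two model lists imported above, an equivalent hand-written spelling (illustration only) is:

models = [*lmdeploy_qwen2_5_7b_instruct_model,
          *lmdeploy_internlm3_8b_instruct_model]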
examples/eval_codebench_passk.py (new file, 161 lines)
@@ -0,0 +1,161 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

with read_base():
    # Datasets Part
    # bigcodebench
    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
        bigcodebench_full_instruct_datasets
    )
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
        bigcodebench_hard_instruct_datasets
    )
    # livecodebench code generation lite v5
    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
        LCB_datasets
    )
    # humaneval
    from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import (
        humaneval_datasets
    )
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
        humanevalpro_datasets
    )
    # mbpp
    from opencompass.configs.datasets.mbpp.mbpp_passk_gen_830460 import (
        mbpp_datasets
    )
    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
        mbpppro_datasets
    )
    # multipl-e
    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
        multiple_datasets
    )

    # Models Part
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
        models as lmdeploy_internlm3_8b_instruct_model,
    )

# models config
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
    model['max_seq_len'] = 16384
    model['max_out_len'] = 8192

# datasets config
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)
num_repeats = 5
k = (1, 3, 5)
for dataset in datasets:
    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
    # openai pass@k config: the current setting is pass@5 (n=5).
    if not any(exclude in dataset['abbr'] for exclude in ('mbpp', 'humaneval')):
        dataset['eval_cfg']['evaluator']['num_repeats'] = num_repeats
        dataset['eval_cfg']['evaluator']['k'] = k
    dataset['num_repeats'] = num_repeats
    # dataset['abbr'] += f'_passk'

# summary
summarizer = dict(
    dataset_abbrs=[
        'pass@1',
        ['bigcodebench_full_instruct_passk', 'pass@1'],
        ['bigcodebench_hard_instruct_passk', 'pass@1'],
        ['lcb_code_generation_passk', 'pass@1'],
        ['openai_humaneval_passk_passk', 'humaneval_pass@1'],
        ['humaneval_pro_passk', 'pass@1'],
        ['mbpp_passk_passk', 'pass@1'],
        ['mbpp_pro_passk', 'pass@1'],
        ['humaneval-multiple-cpp_passk', 'pass@1'],
        ['humaneval-multiple-cs_passk', 'pass@1'],
        ['humaneval-multiple-go_passk', 'pass@1'],
        ['humaneval-multiple-java_passk', 'pass@1'],
        ['humaneval-multiple-rb_passk', 'pass@1'],
        ['humaneval-multiple-js_passk', 'pass@1'],
        ['humaneval-multiple-php_passk', 'pass@1'],
        ['humaneval-multiple-r_passk', 'pass@1'],
        ['humaneval-multiple-rs_passk', 'pass@1'],
        ['humaneval-multiple-sh_passk', 'pass@1'],
        ['mbpp-multiple-cpp_passk', 'pass@1'],
        ['mbpp-multiple-cs_passk', 'pass@1'],
        ['mbpp-multiple-go_passk', 'pass@1'],
        ['mbpp-multiple-java_passk', 'pass@1'],
        ['mbpp-multiple-rb_passk', 'pass@1'],
        ['mbpp-multiple-js_passk', 'pass@1'],
        ['mbpp-multiple-php_passk', 'pass@1'],
        ['mbpp-multiple-r_passk', 'pass@1'],
        ['mbpp-multiple-rs_passk', 'pass@1'],
        ['mbpp-multiple-sh_passk', 'pass@1'],
        '',
        'pass@3',
        ['bigcodebench_full_instruct_passk', 'pass@3'],
        ['bigcodebench_hard_instruct_passk', 'pass@3'],
        ['lcb_code_generation_passk', 'pass@3'],
        ['openai_humaneval_passk_passk', 'humaneval_pass@3'],
        ['humaneval_pro_passk', 'pass@3'],
        ['mbpp_passk_passk', 'pass@3'],
        ['mbpp_pro_passk', 'pass@3'],
        ['humaneval-multiple-cpp_passk', 'pass@3'],
        ['humaneval-multiple-cs_passk', 'pass@3'],
        ['humaneval-multiple-go_passk', 'pass@3'],
        ['humaneval-multiple-java_passk', 'pass@3'],
        ['humaneval-multiple-rb_passk', 'pass@3'],
        ['humaneval-multiple-js_passk', 'pass@3'],
        ['humaneval-multiple-php_passk', 'pass@3'],
        ['humaneval-multiple-r_passk', 'pass@3'],
        ['humaneval-multiple-rs_passk', 'pass@3'],
        ['humaneval-multiple-sh_passk', 'pass@3'],
        ['mbpp-multiple-cpp_passk', 'pass@3'],
        ['mbpp-multiple-cs_passk', 'pass@3'],
        ['mbpp-multiple-go_passk', 'pass@3'],
        ['mbpp-multiple-java_passk', 'pass@3'],
        ['mbpp-multiple-rb_passk', 'pass@3'],
        ['mbpp-multiple-js_passk', 'pass@3'],
        ['mbpp-multiple-php_passk', 'pass@3'],
        ['mbpp-multiple-r_passk', 'pass@3'],
        ['mbpp-multiple-rs_passk', 'pass@3'],
        ['mbpp-multiple-sh_passk', 'pass@3'],
        '',
        'pass@5',
        ['bigcodebench_full_instruct_passk', 'pass@5'],
        ['bigcodebench_hard_instruct_passk', 'pass@5'],
        ['lcb_code_generation_passk', 'pass@5'],
        ['openai_humaneval_passk_passk', 'humaneval_pass@5'],
        ['humaneval_pro_passk', 'pass@5'],
        ['mbpp_passk_passk', 'pass@5'],
        ['mbpp_pro_passk', 'pass@5'],
        ['humaneval-multiple-cpp_passk', 'pass@5'],
        ['humaneval-multiple-cs_passk', 'pass@5'],
        ['humaneval-multiple-go_passk', 'pass@5'],
        ['humaneval-multiple-java_passk', 'pass@5'],
        ['humaneval-multiple-rb_passk', 'pass@5'],
        ['humaneval-multiple-js_passk', 'pass@5'],
        ['humaneval-multiple-php_passk', 'pass@5'],
        ['humaneval-multiple-r_passk', 'pass@5'],
        ['humaneval-multiple-rs_passk', 'pass@5'],
        ['humaneval-multiple-sh_passk', 'pass@5'],
        ['mbpp-multiple-cpp_passk', 'pass@5'],
        ['mbpp-multiple-cs_passk', 'pass@5'],
        ['mbpp-multiple-go_passk', 'pass@5'],
        ['mbpp-multiple-java_passk', 'pass@5'],
        ['mbpp-multiple-rb_passk', 'pass@5'],
        ['mbpp-multiple-js_passk', 'pass@5'],
        ['mbpp-multiple-php_passk', 'pass@5'],
        ['mbpp-multiple-r_passk', 'pass@5'],
        ['mbpp-multiple-rs_passk', 'pass@5'],
        ['mbpp-multiple-sh_passk', 'pass@5'],
    ],
)

work_dir = 'outputs/code_passk'
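Note: num_repeats = 5 with k = (1, 3, 5) feeds the unbiased pass@k estimator that this PR wires into the evaluators (estimate_pass_at_k in code_evaluator.py further down). A minimal standalone sketch of that formula, for illustration only (pass_at_k here is a local helper, not an OpenCompass API):

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    # 1 - C(n-c, k) / C(n, k): the chance that at least one of k
    # completions drawn from the n generated ones passes, given that
    # c of the n passed.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# a problem where 2 of the 5 repeats passed:
print(pass_at_k(5, 2, 1))  # 0.4
print(pass_at_k(5, 2, 3))  # 0.9
print(pass_at_k(5, 2, 5))  # 1.0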
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)

bigcodebench_full_reader_cfg = dict(
    input_columns=['instruct_prompt'],
    output_column='test',
)

bigcodebench_full_infer_cfg = dict(prompt_template=dict(
    type=PromptTemplate,
    template=dict(
        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
        round=[
            dict(role='HUMAN', prompt='{instruct_prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

bigcodebench_full_eval_cfg = dict(
    evaluator=dict(
        type=BigCodeBenchEvaluator,
        release_version='v0.1.2',
        eval_type='instruct',
        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
        # remote_execute_api=
        # 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
        dataset_version='full',
        num_repeats=1,
        k=1,
    ),
    pred_role='BOT',
)

bigcodebench_full_instruct_datasets = [
    dict(abbr='bigcodebench_full_instruct',
         type=BigCodeBenchDataset,
         path='opencompass/bigcodebench',
         reader_cfg=bigcodebench_full_reader_cfg,
         infer_cfg=bigcodebench_full_infer_cfg,
         eval_cfg=bigcodebench_full_eval_cfg,
         release_version='v0.1.2',
         num_repeats=1,)
]
@@ -24,10 +24,12 @@ bigcodebench_hard_eval_cfg = dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='instruct',
-        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
-        remote_execute_api=
-        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
+        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api=
+        # 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='hard',
+        num_repeats=1,
+        k=1,
     ),
     pred_role='BOT',
 )
@@ -42,5 +44,6 @@ bigcodebench_hard_instruct_datasets = [
         eval_cfg=bigcodebench_hard_eval_cfg,
         release_version='v0.1.2',
         dataset_version='hard',
+        num_repeats=1,
     )
 ]
@@ -19,9 +19,9 @@ humaneval_infer_cfg = dict(
     inferencer=dict(type=GenInferencer, max_out_len=512))

 humaneval_eval_cfg = dict(
-    evaluator=dict(type=HumanEvalEvaluator),
+    evaluator=dict(type=HumanEvalEvaluator,
+                   k=1),
     pred_role='BOT',
-    k=[1, 10, 100],  # the parameter only for humaneval
     pred_postprocessor=dict(type=humaneval_postprocess_v2),
 )

@@ -32,5 +32,6 @@ humaneval_datasets = [
         path='opencompass/humaneval',
         reader_cfg=humaneval_reader_cfg,
         infer_cfg=humaneval_infer_cfg,
-        eval_cfg=humaneval_eval_cfg)
+        eval_cfg=humaneval_eval_cfg,
+        num_repeats=1)
 ]
@@ -33,4 +33,4 @@ humaneval_plus_datasets = [
         reader_cfg=humaneval_plus_reader_cfg,
         infer_cfg=humaneval_plus_infer_cfg,
         eval_cfg=humaneval_plus_eval_cfg)
-]
+]
opencompass/configs/datasets/humaneval_pro/README.md (new file, 17 lines)
@@ -0,0 +1,17 @@
# HumanEval pro

## OC results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     67 |
| deepseek-v2-lite-chat-hf     |     35 |

## CodeEval-pro results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     65 |
| deepseek-v2-lite-chat-hf     |     28 |
@@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2

OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.

@@ Instruction
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```

@@ Response
Please put the two solutions to the above problems in one Python code block.
"""

PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```

Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""

humanevalpro_reader_cfg = dict(
    input_columns=['raw_problem', 'new_problem'], output_column='test_code')

humanevalpro_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=PROMPT_WRAPPER),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

humanevalpro_eval_cfg = dict(
    evaluator=dict(type=HumanevalProEvaluator,
                   ip_address='https://opencompass-multiple-evaluator.hf.space',
                   k=1)
)

humanevalpro_datasets = [
    dict(
        abbr='humaneval_pro',
        type=HumanevalevalProDataset,
        path='opencompass/humaneval_pro',
        num_repeats=1,
        reader_cfg=humanevalpro_reader_cfg,
        infer_cfg=humanevalpro_infer_cfg,
        eval_cfg=humanevalpro_eval_cfg,)
]
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator

humanevalx_reader_cfg = dict(
    input_columns=['prompt'], output_column='declaration', train_split='test')

humanevalx_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

humanevalx_eval_cfg_dict = {
    lang: dict(
        evaluator=dict(
            type=HumanevalXEvaluator,
            language=lang,
            ip_address='localhost',  # replace with your code_eval_server ip_address and port
            port=5001),  # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
        pred_role='BOT')
    for lang in ['python', 'cpp', 'java', 'js']  # rust & go are not supported yet
}

# Please download the needed `xx.jsonl.gz` files from
# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
# and move them into the `data/humanevalx/` folder
humanevalx_datasets = [
    dict(
        type=HumanevalXDataset,
        abbr=f'humanevalx-{lang}',
        language=lang,
        path='./data/humanevalx',
        reader_cfg=humanevalx_reader_cfg,
        infer_cfg=humanevalx_infer_cfg,
        eval_cfg=humanevalx_eval_cfg_dict[lang])
    for lang in ['python', 'cpp', 'java', 'js']
]
@@ -33,9 +33,11 @@ lcb_code_generation_eval_cfg = dict(
     evaluator=dict(type=LCBCodeGenerationEvaluator,
                    num_process_evaluate=4,
                    timeout=6,
-                   release_version='release_v5',
+                   release_version='v5',
                    start_date='2024-08-01',
-                   end_date='2025-02-01'),
+                   end_date='2025-02-01',
+                   num_repeats=1,
+                   k=1,),
     pred_role='BOT',
 )

@@ -46,7 +48,8 @@ LCBCodeGeneration_dataset = dict(
     reader_cfg=lcb_code_generation_reader_cfg,
     infer_cfg=lcb_code_generation_infer_cfg,
     eval_cfg=lcb_code_generation_eval_cfg,
-    release_version='release_v5',
+    release_version='v5',
+    num_repeats=1,
 )

 # Code Execution Dataset
@@ -127,6 +130,6 @@ LCBTestOutput_dataset = dict(

 LCB_datasets = [
     LCBCodeGeneration_dataset,
-    LCBCodeExecution_dataset,
-    LCBTestOutput_dataset,
+    # LCBCodeExecution_dataset,
+    # LCBTestOutput_dataset,
 ]
@@ -28,7 +28,9 @@ mbpp_infer_cfg = dict(
     inferencer=dict(type=GenInferencer, max_out_len=512),
 )

-mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT')
+mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator,
+                                    k=1),
+                     pred_role='BOT')

 mbpp_datasets = [
     dict(
@@ -38,5 +40,6 @@ mbpp_datasets = [
         reader_cfg=mbpp_reader_cfg,
         infer_cfg=mbpp_infer_cfg,
         eval_cfg=mbpp_eval_cfg,
+        num_repeats=1,
     )
 ]
opencompass/configs/datasets/mbpp_pro/README.md (new file, 17 lines)
@@ -0,0 +1,17 @@
# MBPP pro

## OC results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     66 |
| qwen2.5-14b-instruct-hf      |     64 |
| deepseek-v2-lite-chat-hf     |     36 |

## CodeEval-pro results

| model                        | pass@1 |
|:----------------------------:|-------:|
| qwen2.5-coder-7b-instruct-hf |     65 |
| qwen2.5-14b-instruct-hf      |     65 |
| deepseek-v2-lite-chat-hf     |     39 |
opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py (new file, 60 lines)
@@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPProDataset, MBPPProEvaluator

OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.

@@ Instruction
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```

@@ Response
Please put the two solutions to the above problems in one Python code block.
"""

PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
```python
{raw_problem}
{new_problem}
```

Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
```python
```
"""

mbpppro_reader_cfg = dict(
    input_columns=['raw_problem', 'new_problem'], output_column='test_code')

mbpppro_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=PROMPT_WRAPPER),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

mbpppro_eval_cfg = dict(
    evaluator=dict(type=MBPPProEvaluator,
                   ip_address='https://opencompass-multiple-evaluator.hf.space',
                   k=1),
)

mbpppro_datasets = [
    dict(
        abbr='mbpp_pro',
        type=MBPPProDataset,
        path='opencompass/mbpp_pro',
        num_repeats=1,
        reader_cfg=mbpppro_reader_cfg,
        infer_cfg=mbpppro_infer_cfg,
        eval_cfg=mbpppro_eval_cfg)
]
@@ -21,6 +21,7 @@ multiple_eval_cfg = {
     evaluator=dict(
         type=MultiplEEvaluator,
         language=lang,
+        k=1,
         ip_address='https://opencompass-multiple-evaluator.hf.space',
     ),
     pred_role='BOT',
opencompass/configs/summarizers/groups/multipl_e.py (new file, 6 lines)
@@ -0,0 +1,6 @@
multiple_summary_groups = []

humaneval_multiple = ['humaneval-multiple-cpp', 'humaneval-multiple-cs', 'humaneval-multiple-go', 'humaneval-multiple-java', 'humaneval-multiple-rb', 'humaneval-multiple-js', 'humaneval-multiple-php', 'humaneval-multiple-r', 'humaneval-multiple-rs', 'humaneval-multiple-sh']
mbpp_multiple = ['mbpp-multiple-cpp', 'mbpp-multiple-cs', 'mbpp-multiple-go', 'mbpp-multiple-java', 'mbpp-multiple-rb', 'mbpp-multiple-js', 'mbpp-multiple-php', 'mbpp-multiple-r', 'mbpp-multiple-rs', 'mbpp-multiple-sh']
multiple_summary_groups.append({'name': 'multiple', 'subsets': humaneval_multiple})
multiple_summary_groups.append({'name': 'multiple', 'subsets': mbpp_multiple})
@@ -62,6 +62,7 @@ from .hle import *  # noqa: F401, F403
 from .huggingface import *  # noqa: F401, F403
 from .humaneval import *  # noqa: F401, F403
 from .humaneval_multi import *  # noqa: F401, F403
+from .humaneval_pro import *  # noqa: F401, F403
 from .humanevalx import *  # noqa: F401, F403
 from .hungarian_math import *  # noqa: F401, F403
 from .IFEval.ifeval import IFEvalDataset, IFEvaluator  # noqa: F401, F403
@@ -91,6 +92,7 @@ from .math401 import *  # noqa: F401, F403
 from .math_intern import *  # noqa: F401, F403
 from .mathbench import *  # noqa: F401, F403
 from .mbpp import *  # noqa: F401, F403
+from .mbpp_pro import *  # noqa: F401, F403
 from .medbench import *  # noqa: F401, F403
 from .MedXpertQA import *  # noqa: F401, F403
 from .mgsm import *  # noqa: F401, F403
@@ -4,6 +4,7 @@
 import os
+import time
 from concurrent.futures._base import CancelledError
 from typing import List, Sequence, Tuple, Union

 import httpx
 from datasets import Dataset, DatasetDict
@@ -24,7 +25,8 @@ class BigCodeBenchDataset(BaseDataset):
     def load(path: str = 'opencompass/bigcodebench',
              local_mode: bool = False,
              release_version: str = 'v0.1.2',
-             dataset_version: str = 'full'):
+             dataset_version: str = 'full',
+             num_repeats: int = 1):
         """
         Args:
             path (str): The path to the dataset.
@@ -33,6 +35,7 @@ class BigCodeBenchDataset(BaseDataset):
             release_version (str): The release version of the dataset.
             dataset_version (str): The data version of the dataset.
                 only support ['full', 'hard']
+            num_repeats (int): Number of times to repeat dataset for pass@k.
         """
         assert dataset_version in ['full', 'hard'], \
             'dataset_version should be one of ["full", "hard"], '
@@ -45,11 +48,13 @@ class BigCodeBenchDataset(BaseDataset):
         # 'entry_point', 'doc_struct', 'libs'
         if dataset_version == 'full':
             items = JSONToolkit.read_jsonl(
-                os.path.join(path, f'BigCodeBench-{release_version}.jsonl'))
+                os.path.join(path, f'BigCodeBench-{release_version}.jsonl'),
+                num_repeats)
         else:
             items = JSONToolkit.read_jsonl(
                 os.path.join(path,
-                             f'BigCodeBench-Hard-{release_version}.jsonl'))
+                             f'BigCodeBench-Hard-{release_version}.jsonl'),
+                num_repeats)

         dataset['train'] = Dataset.from_list(items)
         dataset['test'] = Dataset.from_list(items)
@@ -61,10 +66,10 @@ class BigCodeBenchEvaluator(BaseEvaluator):
     """Evaluator for BigCodeBench.

     Args:
         num_process_evaluate (int): number of processes to evaluate
         timeout (int): timeout for each evaluation
         release_version (str): release version of BigCodeBench
         eval_type (str): type of evaluation, either 'instruct' or 'completion'
+        k (str): pass@k for evaluation
     """

     def __init__(
@@ -75,7 +80,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
         dataset_version: str = 'full',
         local_mode: bool = False,
         path: str = 'opencompass/bigcodebench',
-        pass_k: str = '1,5,10',
+        num_repeats=1,
+        k: Union[int, Tuple[int, ...], List[int]] = 1,
         parallel: int = -1,
         min_time_limit: float = 1,
         max_as_limit: int = 30 * 1024,
@@ -88,12 +95,17 @@ class BigCodeBenchEvaluator(BaseEvaluator):
             release_version=release_version,
             dataset_version=dataset_version,
             local_mode=local_mode,
-            path=path)['test']
+            path=path,
+            num_repeats=num_repeats)['test']
         self.eval_type = eval_type
+        if not isinstance(k, Sequence):
+            k = (k, )
+        k = ', '.join(map(str, k))
+        self.k = k
         self.remote_execute_api = remote_execute_api

         self.eval_kwargs = dict(subset=dataset_version,
-                                pass_k=pass_k,
+                                pass_k=self.k,
                                 parallel=parallel,
                                 min_time_limit=min_time_limit,
                                 max_as_limit=max_as_limit,
@@ -141,7 +153,7 @@ class BigCodeBenchEvaluator(BaseEvaluator):
             signal.alarm(0)
             signal.signal(signal.SIGALRM, original_handler)

-        with timeout_handler(10):
+        with timeout_handler(300):
             sanitized_prediction = extract_code_generation(
                 prediction, entrypoint=entrypoint)

@@ -188,7 +200,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
         while True:
             try:
                 eval_client = Client(self.remote_execute_api,
-                                     httpx_kwargs=dict(proxies=proxies))
+                                     httpx_kwargs=dict(
+                                         proxies=proxies,
+                                         timeout=httpx.Timeout(100.0)))
                 results, pass_at_k = eval_client.predict(
                     split=self.eval_type,
                     samples=handle_file(submitted_contents_path),
@@ -196,22 +210,25 @@ class BigCodeBenchEvaluator(BaseEvaluator):
                     **self.eval_kwargs)
                 break
             except (httpx.ReadTimeout, CancelledError):
-                logger.info('Read timeout error. Retrying in 4s...')
+                logger.info('Read timeout error. Retrying in 10s...')
+                time.sleep(10)

-        if 'pass@1' in pass_at_k.keys():
-            pass_at_k['pass@1'] *= 100
-        dump_results = {'details': self._results_processor(results)}
-        dump_results.update(pass_at_k)
-
-        return dump_results
+        pass_at_k = {
+            k: v * 100 if isinstance(v, (int, float)) else v
+            for k, v in pass_at_k.items()
+        }
+        return {
+            **pass_at_k,
+            'details': self._results_processor(results),
+        }

     def _results_processor(self, results):
         details = []
         for key, value in results['eval'].items():
-            if value[0]['status'] == 'pass':
-                value[0]['correct'] = True
-            else:
-                value[0]['correct'] = False
-            details.append(value[0])
+            detail = {'correct': False, 'results_details': value}
+            for v in value:
+                if v['status'] == 'pass':
+                    detail['correct'] = True
+                    break
+            details.append(detail)
         return details
@@ -191,14 +191,19 @@ class CodeCustomDataset(BaseDataset):
         path = get_data_path(path, local_mode=local_mode)
         if file_name is not None:
             path = os.path.join(path, file_name)
+        files = os.listdir(path)
         data = []
-        if path.endswith('.jsonl'):
-            with open(path, 'r', encoding='utf-8') as f:
+        if any(f.endswith('.jsonl') for f in files):
+            target_file = next(f for f in files if f.endswith('.jsonl'))
+            target_path = os.path.join(path, target_file)
+            with open(target_path, 'r', encoding='utf-8') as f:
                 for line in f:
                     data.extend(
                         [json.loads(line.strip()) for _ in range(num_repeats)])
-        elif path.endswith('.csv'):
-            with open(path, 'r', encoding='utf-8-sig') as f:
+        elif any(f.endswith('.csv') for f in files):
+            target_file = next(f for f in files if f.endswith('.csv'))
+            target_path = os.path.join(path, target_file)
+            with open(target_path, 'r', encoding='utf-8-sig') as f:
                 reader = csv.reader(f)
                 header = next(reader)
                 for row in reader:
@@ -6,7 +6,7 @@ import os.path as osp
 import re
 import tempfile
 from os import environ
-from typing import List
+from typing import List, Sequence, Tuple, Union

 from datasets import Dataset

@@ -70,12 +70,16 @@ class HumanevalDataset(BaseDataset):
 class HumanEvalEvaluator(BaseEvaluator):
     """Evaluator for HumanEval or EvalPlus."""

-    def __init__(self, k: List[int] = [1, 10, 100]) -> None:
+    def __init__(self, k: Union[int, Tuple[int, ...], List[int]] = 1,
+                 num_repeats: int = 1) -> None:
         try:
             import human_eval
         except ImportError:
             raise ImportError(HUMANEVAL_IMPORT_ERROR)

+        self.n = num_repeats
+        if not isinstance(k, Sequence):
+            k = (k, )
         self.k = k
         super().__init__()

@@ -87,16 +91,24 @@ class HumanEvalEvaluator(BaseEvaluator):
         from human_eval.evaluation import evaluate_functional_correctness

         prompts = [item['prompt'] for item in test_set]
-        humaneval_preds = []
+        predictions_processed, references_processed = [], []
+        for pred, refer in zip(predictions, references):
+            if references_processed and refer == references_processed[-1]:
+                predictions_processed[-1].extend([pred])
+            else:
+                references_processed.append(refer)
+                predictions_processed.append([pred])
+
         # create json file in human_eval format
-        for preds, refer in zip(predictions, references):
+        humaneval_preds = []
+        for preds_p, refer_p in zip(predictions_processed, references_processed):
             # suits for two case
             # 1. use repeated dataset
             # 2. use `num_return_sequences` to generate multiple responses
-            if not isinstance(preds, list):
-                preds = [preds]
-            for pred in preds:
-                humaneval_preds.append({'task_id': refer, 'completion': pred})
+            if not isinstance(preds_p, list):
+                preds_p = [preds_p]
+            for pred_p in preds_p:
+                humaneval_preds.append({'task_id': refer_p, 'completion': pred_p})
         with tempfile.TemporaryDirectory() as tmp_dir:
             out_dir = osp.join(tmp_dir, 'human_eval.json')
             write_jsonl(out_dir, humaneval_preds)
@@ -183,13 +195,13 @@ def humaneval_postprocess_v2(text: str) -> str:
     blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
     if len(blocks) >= 1:
         text = blocks[0]
-    return text
+    return text.lstrip()


 def humaneval_postprocess_v3(text: str) -> str:
     blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
     if len(blocks) >= 1:
         text = blocks[-1]
-    return text
+    return text.lstrip()


 def humaneval_internal_v2_postprocess(text: str):
     if text.startswith(' ') and not text.startswith('    '):
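Note: the regrouping added to HumanEvalEvaluator.score collapses a dataset that was repeated num_repeats times back into one candidate list per task. It relies on repeated copies of a task being adjacent, which is how the repeated datasets in this PR are built. A minimal illustration with hypothetical task ids:

references = ['HumanEval/0', 'HumanEval/0', 'HumanEval/1', 'HumanEval/1']
predictions = ['cand_a', 'cand_b', 'cand_c', 'cand_d']
# after the loop:
#   references_processed  == ['HumanEval/0', 'HumanEval/1']
#   predictions_processed == [['cand_a', 'cand_b'], ['cand_c', 'cand_d']]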
opencompass/datasets/humaneval_pro.py (new file, 96 lines)
@@ -0,0 +1,96 @@
import json
from typing import Dict, List

import numpy as np
from datasets import Dataset

from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path

from .base import BaseDataset


class HumanevalevalProDataset(BaseDataset):

    @staticmethod
    def load(path, num_repeats=1, local_mode=False):
        path = get_data_path(path, local_mode=local_mode)
        dataset = []
        with open(path, encoding='utf-8') as f:
            raw_data = json.load(f)
            for data in raw_data:
                dataset.extend([data for _ in range(num_repeats)])
        return Dataset.from_list(dataset)


class HumanevalProEvaluator(CodeEvaluator):

    def _process_completions(self, test_case: dict, completions: list) -> list:
        processed_completions = []
        for comp in completions:
            post_comp = self._extract_code(comp)
            processed_completions.append(post_comp)
        return processed_completions

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> Dict:
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        test_set = test_set.to_pandas()
        # Use the first column as the unique identifier
        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
        num_repeats = int(len(test_set) / len(test_set_origin))

        # 1. Prepare data for all test cases
        all_test_cases = []
        for i in range(len(test_set_origin)):
            test_case = test_set_origin.iloc[i]
            completions = predictions[i * num_repeats:(i + 1) * num_repeats]

            # Process code completions
            processed_completions = self._process_completions(
                test_case, completions)

            sub_data_dict = {
                'name': int(test_case['id']),
                'language': self.language,
                'prompt': '',
                'tests': test_case['test_code'],
                'processed_completions': processed_completions,
                'completions': completions
            }

            all_test_cases.append(sub_data_dict)

        # 2. Send all test cases to the evaluation service
        success, outputs, error_message = self._evaluate(all_test_cases)
        if not success:
            return {'error': error_message}

        # 3. Process the returned results
        details = []
        total, correct = [], []
        for output in outputs:
            passed = [m['status'] == 'OK' for m in output['meta_data']]
            total.append(len(passed))
            correct.append(sum(passed))
            details.append(output)
        total = np.array(total)
        correct = np.array(correct)

        pass_at_k = {
            f'pass@{k}':
            self.estimate_pass_at_k(total, correct, k).mean() * 100
            for k in self.k if (total >= k).all()
        }

        return {
            **pass_at_k,
            'details': details,
        }
opencompass/datasets/humaneval_pro_.py (new file, 89 lines)
@@ -0,0 +1,89 @@
import json
import os

import evaluate
from datasets import Dataset

from opencompass.openicl.icl_evaluator import HuggingfaceEvaluator
from opencompass.utils import get_data_path

from .base import BaseDataset

os.environ['HF_ALLOW_CODE_EVAL'] = '1'


class HumanevalevalProDataset(BaseDataset):

    @staticmethod
    def load(path, num_repeats=1, local_mode=False):
        path = get_data_path(path, local_mode=local_mode)
        dataset = []
        with open(path, encoding='utf-8') as f:
            raw_data = json.load(f)
            for data in raw_data:
                dataset.extend([data for _ in range(num_repeats)])
        return Dataset.from_list(dataset)


class HumanevalProEvaluator(HuggingfaceEvaluator):

    def _preprocess(self, predictions, references):
        predictions = [[_] for _ in predictions]
        return {
            'predictions': predictions,
            'references': references,
        }

    def _postprocess(self, scores):
        scores = {f'humaneval_{k}': scores[k] * 100 for k in scores}
        return scores

    def score(self, predictions, references, test_set):
        # predictions are the LLM's outputs; references are the 'output_column' of 'humanevalpro_reader_cfg'  # noqa: E501
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        # use code pre-downloaded into the opencompass repo to avoid downloading
        current_dir = os.path.dirname(os.path.abspath(__file__))
        parent_dir = os.path.dirname(current_dir)
        local_path = os.path.join(parent_dir, 'openicl', 'icl_evaluator',
                                  'hf_metrics', self.metric)

        if os.path.exists(local_path):
            metric = evaluate.load(local_path)
        else:
            metric = evaluate.load(self.metric)
        scores, _ = metric.compute(**self._preprocess(predictions, references),
                                   k=[1, 3, 5],
                                   num_workers=4)
        result = self._postprocess(scores)
        return result


def humanevalpro_postprocess_official(text):
    """The official post-processing method for humaneval_pro, solely
    applicable to the complete-generation paradigm.

    The chat-template paradigm requires a different post-processing
    method.
    """
    text = text[:index if (index := text.find('```')) != -1 else len(text)]
    return text


def humanevalpro_postprocess_oc(text):
    """Recommended post-processing for outputs generated under the
    chat-template paradigm."""
    start = text.rfind('```python') + len('```python')
    end = text.find('```', start)

    code = text[start:end].strip()
    return code
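For example, humanevalpro_postprocess_oc pulls the last fenced Python block out of a chat-style reply (the reply text below is hypothetical):

reply = ('Sure, here are both solutions:\n'
         '```python\n'
         'def first(x):\n'
         '    return x + 1\n'
         'def second(x):\n'
         '    return first(first(x))\n'
         '```\n'
         'Hope this helps.')
print(humanevalpro_postprocess_oc(reply))
# def first(x):
#     return x + 1
# def second(x):
#     return first(first(x))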
@@ -90,7 +90,7 @@ class HumanevalXEvaluator(BaseEvaluator):
         self.timeout = timeout
         super().__init__()

-    def score(self, predictions, references):
+    def score(self, predictions, references, test_set):
         predictions = [{
             'task_id':
             f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
@@ -3,6 +3,7 @@ import json
 import multiprocessing
 from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import Sequence

 import numpy as np
 from tqdm import tqdm
@@ -174,7 +175,7 @@ def codegen_metrics(
     samples_list,
     generations_list,
     k_list=[1, 5, 10, 20, 40, 50, 75, 100, 125, 150, 200, 500, 1000],
-    num_process_evaluate=16,
+    num_process_evaluate=8,
     timeout=6,
     debug=False,
 ):
@@ -238,14 +239,20 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
                  release_version='release_v1',
                  extractor_version='v1',
                  start_date=None,
-                 end_date=None):
+                 end_date=None,
+                 num_repeats=1,
+                 k=1):
         super().__init__()
         self.num_process_evaluate = num_process_evaluate
         self.timeout = timeout
+        if not isinstance(k, Sequence):
+            k = (k, )
+        self.k = k
         self.dataset = LCBCodeGenerationDataset.load(
             release_version=release_version,
             start_date=start_date,
-            end_date=end_date)['test']
+            end_date=end_date,
+            num_repeats=num_repeats)['test']
         self.extractor_version = extractor_version

     def score(self, predictions, references):
@@ -273,8 +280,11 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
         filtered_references = []
         for idx, item in enumerate(references):
             if item in self.dataset['question_id']:
-                filtered_predictions.append(predictions[idx])
-                filtered_references.append(item)
+                if filtered_references and item == filtered_references[-1]:
+                    filtered_predictions[-1].extend(predictions[idx])
+                else:
+                    filtered_predictions.append(predictions[idx])
+                    filtered_references.append(item)

         filtered_references = [
             evaluation_samples[item] for item in filtered_references
@@ -291,7 +301,7 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
         metrics, eval_results, final_metadata = codegen_metrics(
             filtered_references,
             filtered_predictions,
-            k_list=[1],
+            k_list=self.k,
             num_process_evaluate=self.num_process_evaluate,
             timeout=self.timeout,
         )
@@ -56,7 +56,8 @@ class LCBCodeGenerationDataset(BaseDataset):
              local_mode: bool = False,
              release_version: str = 'release_v1',
              start_date: str = None,
-             end_date: str = None):
+             end_date: str = None,
+             num_repeats: int = None):

         def transform(item):
             # Define the dataitem mapping logic
@@ -118,7 +119,13 @@ class LCBCodeGenerationDataset(BaseDataset):
         if end_date is not None:
             p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
             dataset = dataset.filter(lambda e: datetime.fromisoformat(e[
-                'contest_date']) <= p_end_date)  # noqa: E501
+                'contest_date']) <= p_end_date)
+
+        if num_repeats and num_repeats > 1:
+            indices = []
+            for idx in range(len(dataset)):
+                indices.extend([idx] * num_repeats)
+            dataset = dataset.select(indices)

         return DatasetDict({'test': dataset, 'train': dataset})
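The selection above interleaves the copies problem by problem rather than appending a second full pass; e.g., for a hypothetical three-problem split:

num_repeats = 2
indices = []
for idx in range(3):  # pretend len(dataset) == 3
    indices.extend([idx] * num_repeats)
print(indices)  # [0, 0, 1, 1, 2, 2]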
@@ -436,7 +436,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
     """Better use for pass k evaluation.

     Args:
-        k(Tuple[int]): Choices of Pass@k. Defaults to (1, 10, 100)
+        k(Union[int, Tuple[int, ...], List[int]]): Choices of Pass@k.
     """

     def __init__(self, k=(1, 10, 100)) -> None:
@@ -478,7 +478,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
         task_total = defaultdict(int)

         result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
-        with ProcessPoolExecutor() as executor:
+        with ProcessPoolExecutor(max_workers=8) as executor:
             futures = []
             for refer, preds in zip(references, predictions):
                 # suits for two case
@@ -494,7 +494,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
                 for pred in preds:
                     pred = self._process_answer(pred)
                     programs = self._process_test(test_case, pred)
-                    future = executor.submit(execution, programs, task_id, 10)
+                    future = executor.submit(execution, programs, task_id, 8)
                     futures.append(future)

         from tqdm import tqdm
opencompass/datasets/mbpp_pro.py (new file, 97 lines)
@@ -0,0 +1,97 @@
import json
from typing import Dict, List

import numpy as np
from datasets import Dataset

from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
from opencompass.utils import get_data_path

from .base import BaseDataset


class MBPPProDataset(BaseDataset):

    @staticmethod
    def load(path, num_repeats=1, local_mode=False):
        path = get_data_path(path, local_mode=local_mode)
        print(path)
        dataset = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                dataset.extend(
                    [json.loads(line.strip()) for _ in range(num_repeats)])
        return Dataset.from_list(dataset)


class MBPPProEvaluator(CodeEvaluator):

    def _process_completions(self, test_case: dict, completions: list) -> list:
        processed_completions = []
        for comp in completions:
            post_comp = self._extract_code(comp)
            processed_completions.append(post_comp)
        return processed_completions

    def score(self, predictions: List, references: List,
              test_set: Dataset) -> Dict:
        if len(predictions) != len(references):
            return {
                'error':
                'predictions and references have different '
                f'length. len(predictions): {len(predictions)}, '
                f'len(references): {len(references)}'
            }

        test_set = test_set.to_pandas()
        # Use the first column as the unique identifier
        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
        num_repeats = int(len(test_set) / len(test_set_origin))

        # 1. Prepare data for all test cases
        all_test_cases = []
        for i in range(len(test_set_origin)):
            test_case = test_set_origin.iloc[i]
            completions = predictions[i * num_repeats:(i + 1) * num_repeats]

            # Process code completions
            processed_completions = self._process_completions(
                test_case, completions)

            sub_data_dict = {
                'name': int(test_case['id']),
                'language': self.language,
                'prompt': '',
                'tests': test_case['test_code'],
                'processed_completions': processed_completions,
                'completions': completions
            }

            all_test_cases.append(sub_data_dict)

        # 2. Send all test cases to the evaluation service
        success, outputs, error_message = self._evaluate(all_test_cases)
        if not success:
            return {'error': error_message}

        # 3. Process the returned results
        details = []
        total, correct = [], []
        for output in outputs:
            passed = [m['status'] == 'OK' for m in output['meta_data']]
            total.append(len(passed))
            correct.append(sum(passed))
            details.append(output)
        total = np.array(total)
        correct = np.array(correct)

        pass_at_k = {
            f'pass@{k}':
            self.estimate_pass_at_k(total, correct, k).mean() * 100
            for k in self.k if (total >= k).all()
        }

        return {
            **pass_at_k,
            'details': details,
        }
@@ -1,12 +1,14 @@
 # flake8: noqa: E501

 import difflib
+import itertools
 import os
 import re
 import tempfile
 import time
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

+import numpy as np
 from datasets import Dataset
 from gradio_client import Client

@@ -24,19 +26,24 @@ class CodeEvaluator(BaseEvaluator):
     """

     def __init__(self,
-                 language: str,
+                 language: str = 'py',
                  ip_address: str = 'localhost',
+                 k: Union[int, Tuple[int, ...], List[int]] = 1,
                  retry: int = 3) -> None:
         """Initialize the CodeEvaluator.

         Args:
             language (str): Programming language of the code to evaluate.
             ip_address (str, optional): IP address of the evaluation service. Defaults to 'localhost'.
+            k (Union[int, Tuple[int, ...], List[int]], optional): The k of pass@k for evaluating the code. Defaults to 1.
             retry (int, optional): Number of retry attempts for failed connections. Defaults to 3.
         """
         self.language = language
         self.retry = retry
         self.client = Client(ip_address)
+        if not isinstance(k, Sequence):
+            k = (k, )
+        self.k = k
         super().__init__()

     def _extract_code(self, text: str) -> str:
@@ -195,6 +202,31 @@ class CodeEvaluator(BaseEvaluator):

         return True, output, None

+    def estimate_pass_at_k(self, num_samples: Union[int, List[int],
+                                                    np.ndarray],
+                           num_correct: Union[List[int], np.ndarray],
+                           k: int) -> np.ndarray:
+        """Estimates pass@k of each problem and returns them in an array."""
+
+        def estimator(n: int, c: int, k: int) -> float:
+            """Calculates 1 - comb(n - c, k) / comb(n, k)."""
+            if n - c < k:
+                return 1.0
+            return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+        if isinstance(num_samples, int):
+            num_samples_it = itertools.repeat(num_samples, len(num_correct))
+        else:
+            assert len(num_samples) == len(num_correct)
+            num_samples_it = iter(num_samples)
+
+        return np.array([
+            estimator(int(n), int(c), k)
+            for n, c in zip(num_samples_it, num_correct)
+        ])
+
     def score(self, predictions: List, references: List,
               test_set: Dataset) -> Dict:
         """Score code generation predictions against references.
@@ -233,7 +265,7 @@ class CodeEvaluator(BaseEvaluator):
             processed_completions = self._process_completions(
                 test_case, completions)

-            result_dict = {
+            sub_data_dict = {
                 'name': test_case['name'],
                 'language': test_case['language'],
                 'prompt': test_case['prompt'],
@@ -242,7 +274,7 @@ class CodeEvaluator(BaseEvaluator):
                 'completions': completions
             }

-            all_test_cases.append(result_dict)
+            all_test_cases.append(sub_data_dict)

         # 2. Send all test cases to the evaluation service
         success, outputs, error_message = self._evaluate(all_test_cases)
@@ -251,17 +283,22 @@ class CodeEvaluator(BaseEvaluator):

         # 3. Process the returned results
         details = []
-        correct = 0
+        total, correct = [], []
         for output in outputs:
-            if output.get('status') == 'OK':
-                output['correct'] = True
-                correct += 1
-            else:
-                output['correct'] = False
-
+            passed = [m['status'] == 'OK' for m in output['meta_data']]
+            total.append(len(passed))
+            correct.append(sum(passed))
             details.append(output)
+        total = np.array(total)
+        correct = np.array(correct)

+        pass_at_k = {
+            f'pass@{k}':
+            self.estimate_pass_at_k(total, correct, k).mean() * 100
+            for k in self.k if (total >= k).all()
+        }
         return {
-            f'pass@{num_repeats}': 100 * correct / len(test_set_origin),
-            'details': details
+            **pass_at_k,
+            'details': details,
         }
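A quick numeric check of the estimator and aggregation above, using hypothetical counts (4 problems, 5 completions each):

import numpy as np

total = np.array([5, 5, 5, 5])    # completions generated per problem
correct = np.array([0, 2, 4, 5])  # completions that passed
# per-problem pass@1 is c/n -> [0.0, 0.4, 0.8, 1.0], so
# evaluator.estimate_pass_at_k(total, correct, 1).mean() * 100 -> 55.0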
@@ -420,6 +420,16 @@ DATASETS_MAPPING = {
         "hf_id": "",
         "local": "./data/OlympiadBench",
     },
+    "opencompass/humaneval_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/humaneval_pro/humaneval_pro.json",
+    },
+    "opencompass/mbpp_pro": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/mbpp_pro/mbpp_pro.json",
+    },
 }

 DATASETS_URL = {
@@ -746,5 +756,13 @@ DATASETS_URL = {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip",
         "md5": "270f399f4142b74f47ecff116cc3b21d"
-    }
+    },
+    "humaneval_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip",
+        "md5": "4c6fe556e84e905e4f0902d699e46de5",
+    },
+    "mbpp_pro": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip",
+        "md5": "eac330b8a0a8687f006265c9383503ce",
+    },
 }
@@ -51,7 +51,8 @@ class JSONToolkit:
             raise

     @staticmethod
-    def read_jsonl(file_path: Union[str, Path]) -> List[Dict[str, Any]]:
+    def read_jsonl(file_path: Union[str, Path],
+                   num_repeats: int = 1) -> List[Dict[str, Any]]:
         """Read a JSONL file and return its contents as a list of dictionaries.

         Args:
@@ -73,7 +74,9 @@ class JSONToolkit:
                 if not line:  # Skip empty lines
                     continue
                 try:
-                    results.append(json.loads(line))
+                    # results.append(json.loads(line))
+                    results.extend(
+                        [json.loads(line) for _ in range(num_repeats)])
                 except json.JSONDecodeError as e:
                     logger.error(
                         f'Invalid JSON on line {line_num}: {str(e)}')
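So with num_repeats=3 every record comes back three times in a row; for a hypothetical data.jsonl holding the two records {"id": 0} and {"id": 1}:

items = JSONToolkit.read_jsonl('data.jsonl', num_repeats=3)
# -> [{'id': 0}, {'id': 0}, {'id': 0}, {'id': 1}, {'id': 1}, {'id': 1}]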