diff --git a/examples/eval_codebench.py b/examples/eval_codebench.py
new file mode 100644
index 00000000..fdc4aa20
--- /dev/null
+++ b/examples/eval_codebench.py
@@ -0,0 +1,153 @@
+from mmengine.config import read_base
+import os.path as osp
+from opencompass.runners import LocalRunner, VOLCRunner
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+with read_base():
+    # Datasets Part
+    # bigcodebench
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
+        bigcodebench_full_instruct_datasets
+    )
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
+        bigcodebench_hard_instruct_datasets
+    )
+    # livecodebench code generation lite v5
+    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
+        LCB_datasets
+    )
+    # humaneval
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
+        humaneval_datasets
+    )
+    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
+        humanevalpro_datasets
+    )
+    from opencompass.configs.datasets.humanevalx.humanevalx_gen_627de5 import (
+        humanevalx_datasets
+    )
+    # mbpp
+    from opencompass.configs.datasets.mbpp.mbpp_gen import (
+        mbpp_datasets
+    )
+    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
+        mbpppro_datasets
+    )
+    # multipl-e
+    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
+        multiple_datasets
+    )
+    # ds1000
+    from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
+        ds1000_datasets
+    )
+
+    # Models Part
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
+        models as lmdeploy_qwen2_5_7b_instruct_model,
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
+        models as lmdeploy_internlm3_8b_instruct_model,
+    )
+
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.ds1000 import (
+        ds1000_summary_groups,
+    )
+    from opencompass.configs.summarizers.groups.multipl_e import (
+        multiple_summary_groups,
+    )
+    from opencompass.configs.summarizers.groups.humanevalx import (
+        humanevalx_summary_groups,
+    )
+
+# models config
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+for model in models:
+    model['max_seq_len'] = 16384
+    model['max_out_len'] = 8192
+
+# datasets config
+datasets = sum(
+    (v for k, v in locals().items() if k.endswith('_datasets')),
+    [],
+)
+
+for item in humanevalx_datasets:
+    item['eval_cfg']['evaluator'][
+        'ip_address'
+    ] = 'codeeval.opencompass.org.cn/humanevalx'
+    item['eval_cfg']['evaluator']['port'] = ''
+for item in ds1000_datasets:
+    item['eval_cfg']['evaluator'][
+        'ip_address'
+    ] = 'codeeval.opencompass.org.cn/ds1000'
+    item['eval_cfg']['evaluator']['port'] = ''
+
+
+for dataset in datasets:
+    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
+
+
+# summary
+summary_groups = sum(
+    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
+)
+summary_groups.append(
+    {'name': 'humanevalx',
+     'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js']}
)
+summarizer = dict(
+    dataset_abbrs = [
+        ['bigcodebench_hard_instruct', 'pass@1'],
+        ['bigcodebench_full_instruct', 'pass@1'],
+        ['lcb_code_generation', 'pass@1'],
+        ['openai_humaneval', 'humaneval_pass@1'],
+        ['mbpp', 'score'],
+        ['humaneval_pro', 'pass@1'],
+        ['mbpp_pro', 'pass@1'],
+        ['multiple', 'naive_average'],
+        ['humanevalx', 'naive_average'],
+        ['ds1000', 'naive_average'],
+        '',
+        'humanevalx-python',
+        'humanevalx-cpp',
+        'humanevalx-java',
+        'humanevalx-js',
+        '',
+        'ds1000_Pandas',
+        'ds1000_Numpy',
+        'ds1000_Tensorflow',
+        'ds1000_Scipy',
+        'ds1000_Sklearn',
+        'ds1000_Pytorch',
+        'ds1000_Matplotlib',
+        '',
+        'humaneval-multiple-cpp',
+        'humaneval-multiple-cs',
+        'humaneval-multiple-go',
+        'humaneval-multiple-java',
+        'humaneval-multiple-rb',
+        'humaneval-multiple-js',
+        'humaneval-multiple-php',
+        'humaneval-multiple-r',
+        'humaneval-multiple-rs',
+        'humaneval-multiple-sh',
+        '',
+        'mbpp-multiple-cpp',
+        'mbpp-multiple-cs',
+        'mbpp-multiple-go',
+        'mbpp-multiple-java',
+        'mbpp-multiple-rb',
+        'mbpp-multiple-js',
+        'mbpp-multiple-php',
+        'mbpp-multiple-r',
+        'mbpp-multiple-rs',
+        'mbpp-multiple-sh'
+    ],
+    summary_groups=summary_groups,
+)
+
+work_dir = 'outputs/code'
diff --git a/examples/eval_codebench_passk.py b/examples/eval_codebench_passk.py
new file mode 100644
index 00000000..0ffd3e6f
--- /dev/null
+++ b/examples/eval_codebench_passk.py
@@ -0,0 +1,161 @@
+from mmengine.config import read_base
+import os.path as osp
+from opencompass.runners import LocalRunner, VOLCRunner
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+with read_base():
+    # Datasets Part
+    # bigcodebench
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen_c3d5ad import (
+        bigcodebench_full_instruct_datasets
+    )
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_c3d5ad import (
+        bigcodebench_hard_instruct_datasets
+    )
+    # livecodebench code generation lite v5
+    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen import (
+        LCB_datasets
+    )
+    # humaneval
+    from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import (
+        humaneval_datasets
+    )
+    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
+        humanevalpro_datasets
+    )
+    # mbpp
+    from opencompass.configs.datasets.mbpp.mbpp_passk_gen_830460 import (
+        mbpp_datasets
+    )
+    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
+        mbpppro_datasets
+    )
+    # multipl-e
+    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
+        multiple_datasets
+    )
+
+    # Models Part
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
+        models as lmdeploy_qwen2_5_7b_instruct_model,
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
+        models as lmdeploy_internlm3_8b_instruct_model,
+    )
+
+# models config
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+for model in models:
+    model['max_seq_len'] = 16384
+    model['max_out_len'] = 8192
+
+# datasets config
+datasets = sum(
+    (v for k, v in locals().items() if k.endswith('_datasets')),
+    [],
+)
+num_repeats = 5
+k = (1, 3, 5)
+for dataset in datasets:
+    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
+    # OpenAI-style pass@k: n=num_repeats samples per problem, report pass@1/3/5.
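+    # Note: mbpp/humaneval already come from dedicated passk configs
+    # (imported above), so only the remaining datasets are patched here.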
+    if not any(exclude in dataset['abbr'] for exclude in ('mbpp', 'humaneval')):
+        dataset['eval_cfg']['evaluator']['num_repeats'] = num_repeats
+        dataset['eval_cfg']['evaluator']['k'] = k
+        dataset['num_repeats'] = num_repeats
+    dataset['abbr'] += '_passk'
+
+# summary
+summarizer = dict(
+    dataset_abbrs = [
+        'pass@1',
+        ['bigcodebench_full_instruct_passk', 'pass@1'],
+        ['bigcodebench_hard_instruct_passk', 'pass@1'],
+        ['lcb_code_generation_passk', 'pass@1'],
+        ['openai_humaneval_passk_passk', 'humaneval_pass@1'],
+        ['humaneval_pro_passk', 'pass@1'],
+        ['mbpp_passk_passk', 'pass@1'],
+        ['mbpp_pro_passk', 'pass@1'],
+        ['humaneval-multiple-cpp_passk', 'pass@1'],
+        ['humaneval-multiple-cs_passk', 'pass@1'],
+        ['humaneval-multiple-go_passk', 'pass@1'],
+        ['humaneval-multiple-java_passk', 'pass@1'],
+        ['humaneval-multiple-rb_passk', 'pass@1'],
+        ['humaneval-multiple-js_passk', 'pass@1'],
+        ['humaneval-multiple-php_passk', 'pass@1'],
+        ['humaneval-multiple-r_passk', 'pass@1'],
+        ['humaneval-multiple-rs_passk', 'pass@1'],
+        ['humaneval-multiple-sh_passk', 'pass@1'],
+        ['mbpp-multiple-cpp_passk', 'pass@1'],
+        ['mbpp-multiple-cs_passk', 'pass@1'],
+        ['mbpp-multiple-go_passk', 'pass@1'],
+        ['mbpp-multiple-java_passk', 'pass@1'],
+        ['mbpp-multiple-rb_passk', 'pass@1'],
+        ['mbpp-multiple-js_passk', 'pass@1'],
+        ['mbpp-multiple-php_passk', 'pass@1'],
+        ['mbpp-multiple-r_passk', 'pass@1'],
+        ['mbpp-multiple-rs_passk', 'pass@1'],
+        ['mbpp-multiple-sh_passk', 'pass@1'],
+        '',
+        'pass@3',
+        ['bigcodebench_full_instruct_passk', 'pass@3'],
+        ['bigcodebench_hard_instruct_passk', 'pass@3'],
+        ['lcb_code_generation_passk', 'pass@3'],
+        ['openai_humaneval_passk_passk', 'humaneval_pass@3'],
+        ['humaneval_pro_passk', 'pass@3'],
+        ['mbpp_passk_passk', 'pass@3'],
+        ['mbpp_pro_passk', 'pass@3'],
+        ['humaneval-multiple-cpp_passk', 'pass@3'],
+        ['humaneval-multiple-cs_passk', 'pass@3'],
+        ['humaneval-multiple-go_passk', 'pass@3'],
+        ['humaneval-multiple-java_passk', 'pass@3'],
+        ['humaneval-multiple-rb_passk', 'pass@3'],
+        ['humaneval-multiple-js_passk', 'pass@3'],
+        ['humaneval-multiple-php_passk', 'pass@3'],
+        ['humaneval-multiple-r_passk', 'pass@3'],
+        ['humaneval-multiple-rs_passk', 'pass@3'],
+        ['humaneval-multiple-sh_passk', 'pass@3'],
+        ['mbpp-multiple-cpp_passk', 'pass@3'],
+        ['mbpp-multiple-cs_passk', 'pass@3'],
+        ['mbpp-multiple-go_passk', 'pass@3'],
+        ['mbpp-multiple-java_passk', 'pass@3'],
+        ['mbpp-multiple-rb_passk', 'pass@3'],
+        ['mbpp-multiple-js_passk', 'pass@3'],
+        ['mbpp-multiple-php_passk', 'pass@3'],
+        ['mbpp-multiple-r_passk', 'pass@3'],
+        ['mbpp-multiple-rs_passk', 'pass@3'],
+        ['mbpp-multiple-sh_passk', 'pass@3'],
+        '',
+        'pass@5',
+        ['bigcodebench_full_instruct_passk', 'pass@5'],
+        ['bigcodebench_hard_instruct_passk', 'pass@5'],
+        ['lcb_code_generation_passk', 'pass@5'],
+        ['openai_humaneval_passk_passk', 'humaneval_pass@5'],
+        ['humaneval_pro_passk', 'pass@5'],
+        ['mbpp_passk_passk', 'pass@5'],
+        ['mbpp_pro_passk', 'pass@5'],
+        ['humaneval-multiple-cpp_passk', 'pass@5'],
+        ['humaneval-multiple-cs_passk', 'pass@5'],
+        ['humaneval-multiple-go_passk', 'pass@5'],
+        ['humaneval-multiple-java_passk', 'pass@5'],
+        ['humaneval-multiple-rb_passk', 'pass@5'],
+        ['humaneval-multiple-js_passk', 'pass@5'],
+        ['humaneval-multiple-php_passk', 'pass@5'],
+        ['humaneval-multiple-r_passk', 'pass@5'],
+        ['humaneval-multiple-rs_passk', 'pass@5'],
+        ['humaneval-multiple-sh_passk', 'pass@5'],
+        ['mbpp-multiple-cpp_passk', 'pass@5'],
+        ['mbpp-multiple-cs_passk', 'pass@5'],
+        ['mbpp-multiple-go_passk', 'pass@5'],
['mbpp-multiple-java_passk', 'pass@5'], + ['mbpp-multiple-rb_passk', 'pass@5'], + ['mbpp-multiple-js_passk', 'pass@5'], + ['mbpp-multiple-php_passk', 'pass@5'], + ['mbpp-multiple-r_passk', 'pass@5'], + ['mbpp-multiple-rs_passk', 'pass@5'], + ['mbpp-multiple-sh_passk', 'pass@5'], + ], +) + +work_dir = 'outputs/code_passk' diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_c3d5ad.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_c3d5ad.py new file mode 100644 index 00000000..ed123d45 --- /dev/null +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_c3d5ad.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) + +bigcodebench_full_reader_cfg = dict( + input_columns=['instruct_prompt'], + output_column='test', +) + +bigcodebench_full_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{instruct_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +bigcodebench_full_eval_cfg = dict( + evaluator=dict( + type=BigCodeBenchEvaluator, + release_version='v0.1.2', + eval_type='instruct', + remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api= + # 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 + dataset_version='full', + num_repeats=1, + k=1, + ), + pred_role='BOT', +) + +bigcodebench_full_instruct_datasets = [ + dict(abbr='bigcodebench_full_instruct', + type=BigCodeBenchDataset, + path='opencompass/bigcodebench', + reader_cfg=bigcodebench_full_reader_cfg, + infer_cfg=bigcodebench_full_infer_cfg, + eval_cfg=bigcodebench_full_eval_cfg, + release_version='v0.1.2', + num_repeats=1,) +] diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py index 4af844fd..d1abd3a4 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py @@ -24,10 +24,12 @@ bigcodebench_hard_eval_cfg = dict( type=BigCodeBenchEvaluator, release_version='v0.1.2', eval_type='instruct', - # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', - remote_execute_api= - 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 + remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api= + # 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 dataset_version='hard', + num_repeats=1, + k=1, ), pred_role='BOT', ) @@ -42,5 +44,6 @@ bigcodebench_hard_instruct_datasets = [ eval_cfg=bigcodebench_hard_eval_cfg, release_version='v0.1.2', dataset_version='hard', + num_repeats=1, ) ] diff --git a/opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py b/opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py index 6224696f..8a0cb3e0 100644 --- a/opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py +++ b/opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py @@ -19,9 +19,9 @@ 
humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvalEvaluator), + evaluator=dict(type=HumanEvalEvaluator, + k=1), pred_role='BOT', - k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), ) @@ -32,5 +32,6 @@ humaneval_datasets = [ path='opencompass/humaneval', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, - eval_cfg=humaneval_eval_cfg) + eval_cfg=humaneval_eval_cfg, + num_repeats=1) ] diff --git a/opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py b/opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py index d602d73b..950a6737 100644 --- a/opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py +++ b/opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py @@ -33,4 +33,4 @@ humaneval_plus_datasets = [ reader_cfg=humaneval_plus_reader_cfg, infer_cfg=humaneval_plus_infer_cfg, eval_cfg=humaneval_plus_eval_cfg) -] +] \ No newline at end of file diff --git a/opencompass/configs/datasets/humaneval_pro/README.md b/opencompass/configs/datasets/humaneval_pro/README.md new file mode 100644 index 00000000..558cf6f1 --- /dev/null +++ b/opencompass/configs/datasets/humaneval_pro/README.md @@ -0,0 +1,17 @@ +# HumanEval pro + +## OC results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 65 | +| qwen2.5-14b-instruct-hf | 67 | +| deepseek-v2-lite-chat-hf | 35 | + +## CodeEval-pro results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 65 | +| qwen2.5-14b-instruct-hf | 65 | +| deepseek-v2-lite-chat-hf | 28 | diff --git a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py new file mode 100644 index 00000000..e6c5946d --- /dev/null +++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py @@ -0,0 +1,60 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2 + +OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +@@ Instruction +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. +```python +{raw_problem} +{new_problem} +``` + +@@ Response +Please put the two solutions to the above problems in one Python code block. +""" + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. 
+```python
+{raw_problem}
+{new_problem}
+```
+
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+humanevalpro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+humanevalpro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalpro_eval_cfg = dict(
+    evaluator=dict(type=HumanevalProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space',
+                   k=1)
+)
+
+humanevalpro_datasets = [
+    dict(
+        abbr='humaneval_pro',
+        type=HumanevalevalProDataset,
+        path='opencompass/humaneval_pro',
+        num_repeats=1,
+        reader_cfg=humanevalpro_reader_cfg,
+        infer_cfg=humanevalpro_infer_cfg,
+        eval_cfg=humanevalpro_eval_cfg,)
+]
diff --git a/opencompass/configs/datasets/humanevalx/humanevalx_gen_627de5.py b/opencompass/configs/datasets/humanevalx/humanevalx_gen_627de5.py
new file mode 100644
index 00000000..59fa91e1
--- /dev/null
+++ b/opencompass/configs/datasets/humanevalx/humanevalx_gen_627de5.py
@@ -0,0 +1,41 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
+
+humanevalx_reader_cfg = dict(
+    input_columns=['prompt'], output_column='declaration', train_split='test')
+
+humanevalx_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalx_eval_cfg_dict = {
+    lang: dict(
+        evaluator=dict(
+            type=HumanevalXEvaluator,
+            language=lang,
+            ip_address=
+            'localhost',  # replace with your code_eval_server ip address and port
+            port=5001),  # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
+        pred_role='BOT')
+    for lang in ['python', 'cpp', 'java', 'js']  # rust & go are not supported yet
+}
+
+# Please download the needed `xx.jsonl.gz` files from
+# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
+# and move them into the `data/humanevalx/` folder
+humanevalx_datasets = [
+    dict(
+        type=HumanevalXDataset,
+        abbr=f'humanevalx-{lang}',
+        language=lang,
+        path='./data/humanevalx',
+        reader_cfg=humanevalx_reader_cfg,
+        infer_cfg=humanevalx_infer_cfg,
+        eval_cfg=humanevalx_eval_cfg_dict[lang])
+    for lang in ['python', 'cpp', 'java', 'js']
+]
diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py
index 89bd9eb1..62aee5d4 100644
--- a/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py
@@ -33,9 +33,11 @@ lcb_code_generation_eval_cfg = dict(
     evaluator=dict(type=LCBCodeGenerationEvaluator,
                    num_process_evaluate=4,
                    timeout=6,
-                   release_version='release_v5',
+                   release_version='v5',
                    start_date='2024-08-01',
-                   end_date='2025-02-01'),
+                   end_date='2025-02-01',
+                   num_repeats=1,
+                   k=1,),
     pred_role='BOT',
 )
 
@@ -46,7 +48,8 @@ LCBCodeGeneration_dataset = dict(
     reader_cfg=lcb_code_generation_reader_cfg,
     infer_cfg=lcb_code_generation_infer_cfg,
     eval_cfg=lcb_code_generation_eval_cfg,
-
release_version='release_v5', + release_version='v5', + num_repeats=1, ) # Code Execution Dataset @@ -127,6 +130,6 @@ LCBTestOutput_dataset = dict( LCB_datasets = [ LCBCodeGeneration_dataset, - LCBCodeExecution_dataset, - LCBTestOutput_dataset, + # LCBCodeExecution_dataset, + # LCBTestOutput_dataset, ] diff --git a/opencompass/configs/datasets/mbpp/mbpp_passk_gen_830460.py b/opencompass/configs/datasets/mbpp/mbpp_passk_gen_830460.py index af5a1057..5221a773 100644 --- a/opencompass/configs/datasets/mbpp/mbpp_passk_gen_830460.py +++ b/opencompass/configs/datasets/mbpp/mbpp_passk_gen_830460.py @@ -28,7 +28,9 @@ mbpp_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512), ) -mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator, + k=1), + pred_role='BOT') mbpp_datasets = [ dict( @@ -38,5 +40,6 @@ mbpp_datasets = [ reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, eval_cfg=mbpp_eval_cfg, + num_repeats=1, ) ] diff --git a/opencompass/configs/datasets/mbpp_pro/README.md b/opencompass/configs/datasets/mbpp_pro/README.md new file mode 100644 index 00000000..85699dce --- /dev/null +++ b/opencompass/configs/datasets/mbpp_pro/README.md @@ -0,0 +1,17 @@ +# MBPP pro + +## OC results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 66 | +| qwen2.5-14b-instruct-hf | 64 | +| deepseek-v2-lite-chat-hf | 36 | + +## CodeEval-pro results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 65 | +| qwen2.5-14b-instruct-hf | 65 | +| deepseek-v2-lite-chat-hf | 39 | diff --git a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py new file mode 100644 index 00000000..a32b8d21 --- /dev/null +++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py @@ -0,0 +1,60 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPProDataset, MBPPProEvaluator + +OFFICIAL_PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +@@ Instruction +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. +```python +{raw_problem} +{new_problem} +``` + +@@ Response +Please put the two solutions to the above problems in one Python code block. +""" + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. 
+```python
+{raw_problem}
+{new_problem}
+```
+
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+mbpppro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+mbpppro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+mbpppro_eval_cfg = dict(
+    evaluator=dict(type=MBPPProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space',
+                   k=1),
+)
+
+mbpppro_datasets = [
+    dict(
+        abbr='mbpp_pro',
+        type=MBPPProDataset,
+        path='opencompass/mbpp_pro',
+        num_repeats=1,
+        reader_cfg=mbpppro_reader_cfg,
+        infer_cfg=mbpppro_infer_cfg,
+        eval_cfg=mbpppro_eval_cfg)
+]
diff --git a/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py b/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py
index 93ab2962..2001f8da 100644
--- a/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py
+++ b/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py
@@ -21,6 +21,7 @@ multiple_eval_cfg = {
         evaluator=dict(
             type=MultiplEEvaluator,
             language=lang,
+            k=1,
             ip_address='https://opencompass-multiple-evaluator.hf.space',
         ),
         pred_role='BOT',
diff --git a/opencompass/configs/summarizers/groups/multipl_e.py b/opencompass/configs/summarizers/groups/multipl_e.py
new file mode 100644
index 00000000..1d50c7b6
--- /dev/null
+++ b/opencompass/configs/summarizers/groups/multipl_e.py
@@ -0,0 +1,5 @@
+multiple_summary_groups = []
+
+humaneval_multiple = ['humaneval-multiple-cpp', 'humaneval-multiple-cs', 'humaneval-multiple-go', 'humaneval-multiple-java', 'humaneval-multiple-rb', 'humaneval-multiple-js', 'humaneval-multiple-php', 'humaneval-multiple-r', 'humaneval-multiple-rs', 'humaneval-multiple-sh']
+mbpp_multiple = ['mbpp-multiple-cpp', 'mbpp-multiple-cs', 'mbpp-multiple-go', 'mbpp-multiple-java', 'mbpp-multiple-rb', 'mbpp-multiple-js', 'mbpp-multiple-php', 'mbpp-multiple-r', 'mbpp-multiple-rs', 'mbpp-multiple-sh']
+multiple_summary_groups.append({'name': 'multiple', 'subsets': humaneval_multiple + mbpp_multiple})
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 3e2d0eef..4123701b 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -62,6 +62,7 @@ from .hle import *  # noqa: F401, F403
 from .huggingface import *  # noqa: F401, F403
 from .humaneval import *  # noqa: F401, F403
 from .humaneval_multi import *  # noqa: F401, F403
+from .humaneval_pro import *  # noqa: F401, F403
 from .humanevalx import *  # noqa: F401, F403
 from .hungarian_math import *  # noqa: F401, F403
 from .IFEval.ifeval import IFEvalDataset, IFEvaluator  # noqa: F401, F403
@@ -91,6 +92,7 @@ from .math401 import *  # noqa: F401, F403
 from .math_intern import *  # noqa: F401, F403
 from .mathbench import *  # noqa: F401, F403
 from .mbpp import *  # noqa: F401, F403
+from .mbpp_pro import *  # noqa: F401, F403
 from .medbench import *  # noqa: F401, F403
 from .MedXpertQA import *  # noqa: F401, F403
 from .mgsm import *  # noqa: F401, F403
diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py
index 9ce3d196..981d35ec 100644
--- a/opencompass/datasets/bigcodebench/bigcodebench.py
+++ b/opencompass/datasets/bigcodebench/bigcodebench.py
@@ -4,6 +4,7 @@
 import os
 import time
 from concurrent.futures._base import CancelledError
+from typing import List, Sequence, Tuple, Union
 
 import httpx
 from datasets import Dataset, DatasetDict
@@ -24,7 +25,8 @@ class BigCodeBenchDataset(BaseDataset):
     def load(path: str = 'opencompass/bigcodebench',
              local_mode: bool = False,
             release_version: str = 'v0.1.2',
-             dataset_version: str = 'full'):
+             dataset_version: str = 'full',
+             num_repeats: int = 1):
         """
         Args:
             path (str): The path to the dataset.
@@ -33,6 +35,7 @@ class BigCodeBenchDataset(BaseDataset):
             release_version (str): The release version of the dataset.
             dataset_version (str): The data version of the dataset.
                 only support ['full', 'hard']
+            num_repeats (int): Number of times to repeat dataset for pass@k.
         """
         assert dataset_version in ['full', 'hard'], \
             'dataset_version should be one of ["full", "hard"], '
@@ -45,11 +48,13 @@ class BigCodeBenchDataset(BaseDataset):
         #  'entry_point', 'doc_struct', 'libs'
         if dataset_version == 'full':
             items = JSONToolkit.read_jsonl(
-                os.path.join(path, f'BigCodeBench-{release_version}.jsonl'))
+                os.path.join(path, f'BigCodeBench-{release_version}.jsonl'),
+                num_repeats)
         else:
             items = JSONToolkit.read_jsonl(
                 os.path.join(path,
-                             f'BigCodeBench-Hard-{release_version}.jsonl'))
+                             f'BigCodeBench-Hard-{release_version}.jsonl'),
+                num_repeats)
 
         dataset['train'] = Dataset.from_list(items)
         dataset['test'] = Dataset.from_list(items)
@@ -61,10 +66,10 @@ class BigCodeBenchEvaluator(BaseEvaluator):
     """Evaluator for BigCodeBench.
 
     Args:
-        num_process_evaluate (int): number of processes to evaluate
         timeout (int): timeout for each evaluation
         release_version (str): release version of BigCodeBench
        eval_type (str): type of evaluation, either 'instruct' or 'completion'
+        k (Union[int, Tuple[int, ...], List[int]]): pass@k value(s) used for evaluation
     """
 
     def __init__(
@@ -75,7 +80,9 @@
         dataset_version: str = 'full',
         local_mode: bool = False,
         path: str = 'opencompass/bigcodebench',
+        num_repeats=1,
         pass_k: str = '1,5,10',
+        k: Union[int, Tuple[int, ...], List[int]] = 1,
         parallel: int = -1,
         min_time_limit: float = 1,
         max_as_limit: int = 30 * 1024,
@@ -88,12 +95,17 @@
             release_version=release_version,
             dataset_version=dataset_version,
             local_mode=local_mode,
-            path=path)['test']
+            path=path,
+            num_repeats=num_repeats)['test']
         self.eval_type = eval_type
+        if not isinstance(k, Sequence):
+            k = (k, )
+        k = ', '.join(map(str, k))
+        self.k = k
         self.remote_execute_api = remote_execute_api
 
         self.eval_kwargs = dict(subset=dataset_version,
-                                pass_k=pass_k,
+                                pass_k=self.k,
                                 parallel=parallel,
                                 min_time_limit=min_time_limit,
                                 max_as_limit=max_as_limit,
@@ -141,7 +153,7 @@
                     signal.alarm(0)
                     signal.signal(signal.SIGALRM, original_handler)
 
-            with timeout_handler(10):
+            with timeout_handler(300):
                 sanitized_prediction = extract_code_generation(
                     prediction, entrypoint=entrypoint)
 
@@ -188,7 +200,9 @@
         while True:
             try:
                 eval_client = Client(self.remote_execute_api,
-                                     httpx_kwargs=dict(proxies=proxies))
+                                     httpx_kwargs=dict(
+                                         proxies=proxies,
+                                         timeout=httpx.Timeout(100.0)))
                 results, pass_at_k = eval_client.predict(
                     split=self.eval_type,
                     samples=handle_file(submitted_contents_path),
@@ -196,22 +210,25 @@
                     **self.eval_kwargs)
                 break
             except (httpx.ReadTimeout, CancelledError):
-                logger.info('Read timeout error. 
Retrying in 4s...') + logger.info('Read timeout error. Retrying in 10s...') time.sleep(10) - if 'pass@1' in pass_at_k.keys(): - pass_at_k['pass@1'] *= 100 - dump_results = {'details': self._results_processor(results)} - dump_results.update(pass_at_k) - - return dump_results + pass_at_k = { + k: v * 100 if isinstance(v, (int, float)) else v + for k, v in pass_at_k.items() + } + return { + **pass_at_k, + 'details': self._results_processor(results), + } def _results_processor(self, results): details = [] for key, value in results['eval'].items(): - if value[0]['status'] == 'pass': - value[0]['correct'] = True - else: - value[0]['correct'] = False - details.append(value[0]) + detail = {'correct': False, 'results_details': value} + for v in value: + if v['status'] == 'pass': + detail['correct'] = True + break + details.append(detail) return details diff --git a/opencompass/datasets/custom.py b/opencompass/datasets/custom.py index b5eb8dbb..dfa8bbff 100644 --- a/opencompass/datasets/custom.py +++ b/opencompass/datasets/custom.py @@ -191,14 +191,19 @@ class CodeCustomDataset(BaseDataset): path = get_data_path(path, local_mode=local_mode) if file_name is not None: path = os.path.join(path, file_name) + files = os.listdir(path) data = [] - if path.endswith('.jsonl'): - with open(path, 'r', encoding='utf-8') as f: + if any(f.endswith('.jsonl') for f in files): + target_file = next(f for f in files if f.endswith('.jsonl')) + target_path = os.path.join(path, target_file) + with open(target_path, 'r', encoding='utf-8') as f: for line in f: data.extend( [json.loads(line.strip()) for _ in range(num_repeats)]) - elif path.endswith('.csv'): - with open(path, 'r', encoding='utf-8-sig') as f: + elif any(f.endswith('.csv') for f in files): + target_file = next(f for f in files if f.endswith('.csv')) + target_path = os.path.join(path, target_file) + with open(target_path, 'r', encoding='utf-8-sig') as f: reader = csv.reader(f) header = next(reader) for row in reader: diff --git a/opencompass/datasets/humaneval.py b/opencompass/datasets/humaneval.py index 9788b638..f56f5348 100644 --- a/opencompass/datasets/humaneval.py +++ b/opencompass/datasets/humaneval.py @@ -6,7 +6,7 @@ import os.path as osp import re import tempfile from os import environ -from typing import List +from typing import List, Sequence, Tuple, Union from datasets import Dataset @@ -70,12 +70,16 @@ class HumanevalDataset(BaseDataset): class HumanEvalEvaluator(BaseEvaluator): """Evaluator for HumanEval or EvalPlus.""" - def __init__(self, k: List[int] = [1, 10, 100]) -> None: + def __init__(self, k: Union[int, Tuple[int, ...], List[int]] = 1, + num_repeats: int = 1) -> None: try: import human_eval except ImportError: raise ImportError(HUMANEVAL_IMPORT_ERROR) + self.n = num_repeats + if not isinstance(k, Sequence): + k = (k, ) self.k = k super().__init__() @@ -87,16 +91,24 @@ class HumanEvalEvaluator(BaseEvaluator): from human_eval.evaluation import evaluate_functional_correctness prompts = [item['prompt'] for item in test_set] - humaneval_preds = [] + predictions_processed, references_processed = [], [] + for pred, refer in zip(predictions, references): + if references_processed and refer == references_processed[-1]: + predictions_processed[-1].extend([pred]) + else: + references_processed.append(refer) + predictions_processed.append([pred]) + # create json file in human_eval format - for preds, refer in zip(predictions, references): + humaneval_preds = [] + for preds_p, refer_p in zip(predictions_processed, references_processed): # suits for 
two case # 1. use repeated dataset # 2. use `num_return_sequences` to generate multiple responses - if not isinstance(preds, list): - preds = [preds] - for pred in preds: - humaneval_preds.append({'task_id': refer, 'completion': pred}) + if not isinstance(preds_p, list): + preds_p = [preds_p] + for pred_p in preds_p: + humaneval_preds.append({'task_id': refer_p, 'completion': pred_p}) with tempfile.TemporaryDirectory() as tmp_dir: out_dir = osp.join(tmp_dir, 'human_eval.json') write_jsonl(out_dir, humaneval_preds) @@ -183,13 +195,13 @@ def humaneval_postprocess_v2(text: str) -> str: blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL) if len(blocks) >= 1: text = blocks[0] - return text + return text.lstrip() def humaneval_postprocess_v3(text: str) -> str: blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL) if len(blocks) >= 1: text = blocks[-1] - return text + return text.lstrip() def humaneval_internal_v2_postprocess(text: str): if text.startswith(' ') and not text.startswith(' '): diff --git a/opencompass/datasets/humaneval_pro.py b/opencompass/datasets/humaneval_pro.py new file mode 100644 index 00000000..310b3b41 --- /dev/null +++ b/opencompass/datasets/humaneval_pro.py @@ -0,0 +1,96 @@ +import json +from typing import Dict, List + +import numpy as np +from datasets import Dataset + +from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +class HumanevalevalProDataset(BaseDataset): + + @staticmethod + def load(path, num_repeats=1, local_mode=False): + path = get_data_path(path, local_mode=local_mode) + dataset = [] + with open(path, encoding='utf-8') as f: + raw_data = json.load(f) + for data in raw_data: + dataset.extend([data for _ in range(num_repeats)]) + return Dataset.from_list(dataset) + + +class HumanevalProEvaluator(CodeEvaluator): + + def _process_completions(self, test_case: dict, completions: list) -> list: + processed_completions = [] + for comp in completions: + post_comp = self._extract_code(comp) + processed_completions.append(post_comp) + return processed_completions + + def score(self, predictions: List, references: List, + test_set: Dataset) -> Dict: + if len(predictions) != len(references): + return { + 'error': + 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + test_set = test_set.to_pandas() + # Use the first column as the unique identifier + test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0]) + num_repeats = int(len(test_set) / len(test_set_origin)) + + # 1. Prepare data for all test cases + all_test_cases = [] + for i in range(len(test_set_origin)): + test_case = test_set_origin.iloc[i] + completions = predictions[i * num_repeats:(i + 1) * num_repeats] + + # Process code completions + processed_completions = self._process_completions( + test_case, completions) + + sub_data_dict = { + 'name': int(test_case['id']), + 'language': self.language, + 'prompt': '', + 'tests': test_case['test_code'], + 'processed_completions': processed_completions, + 'completions': completions + } + + all_test_cases.append(sub_data_dict) + + # 2. Send all test cases to the evaluation service + success, outputs, error_message = self._evaluate(all_test_cases) + if not success: + return {'error': error_message} + + # 3. 
Process the returned results
+        details = []
+        total, correct = [], []
+        for output in outputs:
+            passed = [m['status'] == 'OK' for m in output['meta_data']]
+            total.append(len(passed))
+            correct.append(sum(passed))
+            details.append(output)
+        total = np.array(total)
+        correct = np.array(correct)
+
+        pass_at_k = {
+            f'pass@{k}':
+            self.estimate_pass_at_k(total, correct, k).mean() * 100
+            for k in self.k if (total >= k).all()
+        }
+
+        return {
+            **pass_at_k,
+            'details': details,
+        }
diff --git a/opencompass/datasets/humaneval_pro_.py b/opencompass/datasets/humaneval_pro_.py
new file mode 100644
index 00000000..290aae3d
--- /dev/null
+++ b/opencompass/datasets/humaneval_pro_.py
@@ -0,0 +1,86 @@
+import json
+import os
+
+import evaluate
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import HuggingfaceEvaluator
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+
+
+class HumanevalevalProDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, num_repeats=1, local_mode=False):
+        path = get_data_path(path, local_mode=local_mode)
+        dataset = []
+        with open(path, encoding='utf-8') as f:
+            raw_data = json.load(f)
+            for data in raw_data:
+                dataset.extend([data for _ in range(num_repeats)])
+        return Dataset.from_list(dataset)
+
+
+class HumanevalProEvaluator(HuggingfaceEvaluator):
+
+    def _preprocess(self, predictions, references):
+        predictions = [[_] for _ in predictions]
+        return {
+            'predictions': predictions,
+            'references': references,
+        }
+
+    def _postprocess(self, scores):
+        scores = {f'humaneval_{k}': scores[k] * 100 for k in scores}
+        return scores
+
+    def score(self, predictions, references, test_set):
+        # predictions are the LLM's outputs; references come from the 'output_column' of 'humanevalpro_reader_cfg'  # noqa: E501
+        if len(predictions) != len(references):
+            return {
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        # use code pre-downloaded into the opencompass repo to avoid downloading
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        parent_dir = os.path.dirname(current_dir)
+        local_path = os.path.join(parent_dir, 'openicl', 'icl_evaluator',
+                                  'hf_metrics', self.metric)
+
+        if os.path.exists(local_path):
+            metric = evaluate.load(local_path)
+        else:
+            metric = evaluate.load(self.metric)
+        scores, _ = metric.compute(**self._preprocess(predictions, references),
+                                   k=[1, 3, 5],
+                                   num_workers=4)
+        result = self._postprocess(scores)
+        return result
+
+
+def humanevalpro_postprocess_official(text):
+    """The official post-processing method for humaneval_pro; it only suits
+    the completion-style generation paradigm. The chat-template paradigm
+    requires a different post-processing method.
+    """
+    text = text[:index if (index := text.find('```')) != -1 else len(text)]
+    return text
+
+
+def humanevalpro_postprocess_oc(text):
+    """Recommended for outputs generated with the chat-template paradigm."""
+    start = text.rfind('```python')
+    if start == -1:
+        return text.strip()
+    start += len('```python')
+    end = text.find('```', start)
+    if end == -1:
+        end = len(text)
+    return text[start:end].strip()
diff --git a/opencompass/datasets/humanevalx.py b/opencompass/datasets/humanevalx.py
index 369df95c..cc51c4db 100644
--- a/opencompass/datasets/humanevalx.py
+++ b/opencompass/datasets/humanevalx.py
@@ -90,7 +90,7 @@ class HumanevalXEvaluator(BaseEvaluator):
         self.timeout = timeout
         super().__init__()
 
-    def score(self, predictions, references):
+    def score(self, predictions, references, test_set):
         predictions = [{
             'task_id':
             f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
diff --git a/opencompass/datasets/livecodebench/evaluator.py b/opencompass/datasets/livecodebench/evaluator.py
index 65867d47..ac0c11b0 100644
--- a/opencompass/datasets/livecodebench/evaluator.py
+++ b/opencompass/datasets/livecodebench/evaluator.py
@@ -3,6 +3,7 @@ import json
 import multiprocessing
 from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import Sequence
 
 import numpy as np
 from tqdm import tqdm
@@ -174,7 +175,7 @@ def codegen_metrics(
     samples_list,
     generations_list,
     k_list=[1, 5, 10, 20, 40, 50, 75, 100, 125, 150, 200, 500, 1000],
-    num_process_evaluate=16,
+    num_process_evaluate=8,
     timeout=6,
     debug=False,
 ):
@@ -238,14 +239,20 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
                  release_version='release_v1',
                  extractor_version='v1',
                  start_date=None,
-                 end_date=None):
+                 end_date=None,
+                 num_repeats=1,
+                 k=1):
         super().__init__()
         self.num_process_evaluate = num_process_evaluate
         self.timeout = timeout
+        if not isinstance(k, Sequence):
+            k = (k, )
+        self.k = k
         self.dataset = LCBCodeGenerationDataset.load(
             release_version=release_version,
             start_date=start_date,
-            end_date=end_date)['test']
+            end_date=end_date,
+            num_repeats=num_repeats)['test']
         self.extractor_version = extractor_version
 
     def score(self, predictions, references):
@@ -273,8 +280,11 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
         filtered_references = []
         for idx, item in enumerate(references):
             if item in self.dataset['question_id']:
-                filtered_predictions.append(predictions[idx])
-                filtered_references.append(item)
+                if filtered_references and item == filtered_references[-1]:
+                    filtered_predictions[-1].extend(predictions[idx])
+                else:
+                    filtered_predictions.append(predictions[idx])
+                    filtered_references.append(item)
 
         filtered_references = [
             evaluation_samples[item] for item in filtered_references
@@ -291,7 +301,7 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
         metrics, eval_results, final_metadata = codegen_metrics(
             filtered_references,
             filtered_predictions,
-            k_list=[1],
+            k_list=self.k,
             num_process_evaluate=self.num_process_evaluate,
             timeout=self.timeout,
         )
diff --git a/opencompass/datasets/livecodebench/livecodebench.py b/opencompass/datasets/livecodebench/livecodebench.py
index 9ad3f84c..c56b3ac8 100644
--- a/opencompass/datasets/livecodebench/livecodebench.py
+++ b/opencompass/datasets/livecodebench/livecodebench.py
@@ -56,7 +56,8 @@ class LCBCodeGenerationDataset(BaseDataset):
              local_mode: bool = False,
              release_version: str = 'release_v1',
              start_date: str = None,
-             end_date: str = None):
+             end_date: str = None,
+             num_repeats: int = None):
 
         def transform(item):
             # Define the dataitem mapping logic
@@ -118,7 +119,13 @@ class LCBCodeGenerationDataset(BaseDataset):
         if end_date is not None:
             p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
-            dataset = dataset.filter(lambda e: datetime.fromisoformat(e[
-                'contest_date']) <= p_end_date)  # noqa: E501
+            dataset = dataset.filter(lambda e: datetime.fromisoformat(e[
+                'contest_date']) <= p_end_date)
+
+        if num_repeats and num_repeats > 1:
+            indices = []
+            for idx in range(len(dataset)):
+                indices.extend([idx] * num_repeats)
+            dataset = dataset.select(indices)
 
         return DatasetDict({'test': dataset, 'train': dataset})
 
diff --git a/opencompass/datasets/mbpp.py b/opencompass/datasets/mbpp.py
index fca83b31..f93cf821 100644
--- a/opencompass/datasets/mbpp.py
+++ b/opencompass/datasets/mbpp.py
@@ -436,7 +436,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
     """Better use for pass k evaluation.
 
     Args:
-        k(Tuple[int]): Choices of Pass@k. Defaults to (1, 10, 100)
+        k(Union[int, Tuple[int, ...], List[int]]): Choices of Pass@k.
     """
 
     def __init__(self, k=(1, 10, 100)) -> None:
@@ -478,7 +478,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
         task_total = defaultdict(int)
 
         result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
-        with ProcessPoolExecutor() as executor:
+        with ProcessPoolExecutor(max_workers=8) as executor:
             futures = []
             for refer, preds in zip(references, predictions):
                 # suits for two case
@@ -494,7 +494,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
                 for pred in preds:
                     pred = self._process_answer(pred)
                     programs = self._process_test(test_case, pred)
-                    future = executor.submit(execution, programs, task_id, 10)
+                    future = executor.submit(execution, programs, task_id, 8)
                     futures.append(future)
 
         from tqdm import tqdm
diff --git a/opencompass/datasets/mbpp_pro.py b/opencompass/datasets/mbpp_pro.py
new file mode 100644
index 00000000..51a086d7
--- /dev/null
+++ b/opencompass/datasets/mbpp_pro.py
@@ -0,0 +1,96 @@
+import json
+from typing import Dict, List
+
+import numpy as np
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+
+class MBPPProDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, num_repeats=1, local_mode=False):
+        path = get_data_path(path, local_mode=local_mode)
+        dataset = []
+        with open(path, encoding='utf-8') as f:
+            for line in f:
+                dataset.extend(
+                    [json.loads(line.strip()) for _ in range(num_repeats)])
+        return Dataset.from_list(dataset)
+
+
+class MBPPProEvaluator(CodeEvaluator):
+
+    def _process_completions(self, test_case: dict, completions: list) -> list:
+        processed_completions = []
+        for comp in completions:
+            post_comp = self._extract_code(comp)
+            processed_completions.append(post_comp)
+        return processed_completions
+
+    def score(self, predictions: List, references: List,
+              test_set: Dataset) -> Dict:
+        if len(predictions) != len(references):
+            return {
+                'error':
+                'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        test_set = test_set.to_pandas()
+        # Use the first column as the unique identifier
+        test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0])
+        num_repeats = int(len(test_set) / len(test_set_origin))
+
+        # 1. Prepare data for all test cases
+        all_test_cases = []
+        for i in range(len(test_set_origin)):
+            test_case = test_set_origin.iloc[i]
+            completions = predictions[i * num_repeats:(i + 1) * num_repeats]
+
+            # Process code completions
+            processed_completions = self._process_completions(
+                test_case, completions)
+
+            sub_data_dict = {
+                'name': int(test_case['id']),
+                'language': self.language,
+                'prompt': '',
+                'tests': test_case['test_code'],
+                'processed_completions': processed_completions,
+                'completions': completions
+            }
+
+            all_test_cases.append(sub_data_dict)
+
+        # 2. Send all test cases to the evaluation service
+        success, outputs, error_message = self._evaluate(all_test_cases)
+        if not success:
+            return {'error': error_message}
+
+        # 3. Process the returned results
+        details = []
+        total, correct = [], []
+        for output in outputs:
+            passed = [m['status'] == 'OK' for m in output['meta_data']]
+            total.append(len(passed))
+            correct.append(sum(passed))
+            details.append(output)
+        total = np.array(total)
+        correct = np.array(correct)
+
+        pass_at_k = {
+            f'pass@{k}':
+            self.estimate_pass_at_k(total, correct, k).mean() * 100
+            for k in self.k if (total >= k).all()
+        }
+
+        return {
+            **pass_at_k,
+            'details': details,
+        }
diff --git a/opencompass/openicl/icl_evaluator/code_evaluator.py b/opencompass/openicl/icl_evaluator/code_evaluator.py
index d586cd6e..fde8f051 100644
--- a/opencompass/openicl/icl_evaluator/code_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/code_evaluator.py
@@ -1,12 +1,14 @@
 # flake8: noqa: E501
 import difflib
+import itertools
 import os
 import re
 import tempfile
 import time
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 
+import numpy as np
 from datasets import Dataset
 from gradio_client import Client
 
@@ -24,19 +26,24 @@ class CodeEvaluator(BaseEvaluator):
     """
 
     def __init__(self,
-                 language: str,
+                 language: str = 'py',
                  ip_address: str = 'localhost',
+                 k: Union[int, Tuple[int, ...], List[int]] = 1,
                  retry: int = 3) -> None:
         """Initialize the CodeEvaluator.
 
         Args:
            language (str): Programming language of the code to evaluate.
            ip_address (str, optional): IP address of the evaluation service. Defaults to 'localhost'.
+           k (Union[int, Tuple[int, ...], List[int]], optional): The k value(s) used for pass@k evaluation. Defaults to 1.
            retry (int, optional): Number of retry attempts for failed connections. Defaults to 3.
         """
         self.language = language
         self.retry = retry
         self.client = Client(ip_address)
+        if not isinstance(k, Sequence):
+            k = (k, )
+        self.k = k
         super().__init__()
 
     def _extract_code(self, text: str) -> str:
@@ -195,6 +202,31 @@ class CodeEvaluator(BaseEvaluator):
 
         return True, output, None
 
+    def estimate_pass_at_k(self, num_samples: Union[int, List[int],
+                                                    np.ndarray],
+                           num_correct: Union[List[int], np.ndarray],
+                           k: int) -> np.ndarray:
+        """Estimates pass@k of each problem and returns them in an array."""
+
+        def estimator(n: int, c: int, k: int) -> float:
+            """
+            Calculates 1 - comb(n - c, k) / comb(n, k).
+ """ + if n - c < k: + return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + + if isinstance(num_samples, int): + num_samples_it = itertools.repeat(num_samples, len(num_correct)) + else: + assert len(num_samples) == len(num_correct) + num_samples_it = iter(num_samples) + + return np.array([ + estimator(int(n), int(c), k) + for n, c in zip(num_samples_it, num_correct) + ]) + def score(self, predictions: List, references: List, test_set: Dataset) -> Dict: """Score code generation predictions against references. @@ -233,7 +265,7 @@ class CodeEvaluator(BaseEvaluator): processed_completions = self._process_completions( test_case, completions) - result_dict = { + sub_data_dict = { 'name': test_case['name'], 'language': test_case['language'], 'prompt': test_case['prompt'], @@ -242,7 +274,7 @@ class CodeEvaluator(BaseEvaluator): 'completions': completions } - all_test_cases.append(result_dict) + all_test_cases.append(sub_data_dict) # 2. Send all test cases to the evaluation service success, outputs, error_message = self._evaluate(all_test_cases) @@ -251,17 +283,22 @@ class CodeEvaluator(BaseEvaluator): # 3. Process the returned results details = [] - correct = 0 + total, correct = [], [] for output in outputs: - if output.get('status') == 'OK': - output['correct'] = True - correct += 1 - else: - output['correct'] = False - + passed = [m['status'] == 'OK' for m in output['meta_data']] + total.append(len(passed)) + correct.append(sum(passed)) details.append(output) + total = np.array(total) + correct = np.array(correct) + + pass_at_k = { + f'pass@{k}': + self.estimate_pass_at_k(total, correct, k).mean() * 100 + for k in self.k if (total >= k).all() + } return { - f'pass@{num_repeats}': 100 * correct / len(test_set_origin), - 'details': details + **pass_at_k, + 'details': details, } diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 00db25e8..994a48a2 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -420,6 +420,16 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/OlympiadBench", }, + "opencompass/humaneval_pro": { + "ms_id": "", + "hf_id": "", + "local": "./data/humaneval_pro/humaneval_pro.json", + }, + "opencompass/mbpp_pro": { + "ms_id": "", + "hf_id": "", + "local": "./data/mbpp_pro/mbpp_pro.json", + }, } DATASETS_URL = { @@ -746,5 +756,13 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip", "md5": "270f399f4142b74f47ecff116cc3b21d" - } + }, + "humaneval_pro": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip", + "md5": "4c6fe556e84e905e4f0902d699e46de5", + }, + "mbpp_pro": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip", + "md5": "eac330b8a0a8687f006265c9383503ce", + }, } diff --git a/opencompass/utils/fileio.py b/opencompass/utils/fileio.py index 098589b0..6beabf29 100644 --- a/opencompass/utils/fileio.py +++ b/opencompass/utils/fileio.py @@ -51,7 +51,8 @@ class JSONToolkit: raise @staticmethod - def read_jsonl(file_path: Union[str, Path]) -> List[Dict[str, Any]]: + def read_jsonl(file_path: Union[str, Path], + num_repeats: int = 1) -> List[Dict[str, Any]]: """Read a JSONL file and return its contents as a list of dictionaries. 
        Args:
@@ -73,7 +74,8 @@ class JSONToolkit:
                 if not line:  # Skip empty lines
                     continue
                 try:
-                    results.append(json.loads(line))
+                    results.extend(
+                        [json.loads(line) for _ in range(num_repeats)])
                 except json.JSONDecodeError as e:
                     logger.error(
                         f'Invalid JSON on line {line_num}: {str(e)}')
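For reference, the metric this PR threads through all of the evaluators above is the unbiased pass@k estimator from the HumanEval paper: given n samples for a problem, c of which pass, pass@k = 1 - C(n-c, k)/C(n, k), the probability that at least one of k randomly drawn samples passes. The sketch below mirrors the `estimate_pass_at_k` method added to `CodeEvaluator`; the standalone function and the example numbers are illustrative only, not part of the PR:

```python
import itertools
from typing import List, Union

import numpy as np


def estimate_pass_at_k(num_samples: Union[int, List[int]],
                       num_correct: List[int], k: int) -> np.ndarray:
    """Unbiased per-problem pass@k: 1 - C(n - c, k) / C(n, k)."""

    def estimator(n: int, c: int, k: int) -> float:
        if n - c < k:
            return 1.0  # every size-k draw must contain a passing sample
        # numerically stable product form of 1 - C(n - c, k) / C(n, k)
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([
        estimator(int(n), int(c), k)
        for n, c in zip(num_samples_it, num_correct)
    ])


# The eval_codebench_passk.py setting: n=5 samples per problem; three
# hypothetical problems with 1, 2 and 5 passing samples respectively.
print(estimate_pass_at_k(5, [1, 2, 5], k=3))  # -> [0.6 0.9 1. ]
```

The `for k in self.k if (total >= k).all()` guard in the new `score()` implementations simply skips any k larger than the number of samples actually collected, where the estimator would be undefined.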