From d90833f8bc8011fdd699a828f76547c1953dd51e Mon Sep 17 00:00:00 2001
From: Dongsheng Zhu <2532956974@qq.com>
Date: Thu, 15 May 2025 05:38:00 +0000
Subject: [PATCH] fix bug

---
 examples/eval_codebench_full.py               | 154 ++++++++++++++++++
 ...=> livecodebench_time_split_gen_a4f90b.py} |   0
 .../configs/summarizers/groups/multipl_e.py   |   6 +
 3 files changed, 160 insertions(+)
 create mode 100644 examples/eval_codebench_full.py
 rename opencompass/configs/datasets/livecodebench/{livecodebench_time_split_gen.py => livecodebench_time_split_gen_a4f90b.py} (100%)
 create mode 100644 opencompass/configs/summarizers/groups/multipl_e.py

diff --git a/examples/eval_codebench_full.py b/examples/eval_codebench_full.py
new file mode 100644
index 00000000..f01549b6
--- /dev/null
+++ b/examples/eval_codebench_full.py
@@ -0,0 +1,154 @@
+# This config is used to test all the code benchmarks
+from mmengine.config import read_base
+import os.path as osp
+from opencompass.runners import LocalRunner, VOLCRunner
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+with read_base():
+    # Datasets Part
+    # bigcodebench
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen import (
+        bigcodebench_full_instruct_datasets
+    )
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen import (
+        bigcodebench_hard_instruct_datasets
+    )
+    # livecodebench code generation lite v5
+    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen_a4f90b import (
+        LCB_datasets
+    )
+    # humaneval
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
+        humaneval_datasets
+    )
+    from opencompass.configs.datasets.humaneval import (
+        humanevalpro_datasets
+    )
+    from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import (
+        humanevalx_datasets
+    )
+    # mbpp
+    from opencompass.configs.datasets.mbpp.mbpp_gen import (
+        mbpp_datasets
+    )
+    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
+        mbpppro_datasets
+    )
+    # multipl-e
+    from opencompass.configs.datasets.multipl_e.multiple_top_ten_gen import (
+        multiple_datasets
+    )
+    # ds1000
+    from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
+        ds1000_datasets
+    )
+
+    # Models Part
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
+        models as lmdeploy_qwen2_5_7b_instruct_model,
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
+        models as lmdeploy_internlm3_8b_instruct_model,
+    )
+
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.ds1000 import (
+        ds1000_summary_groups,
+    )
+    from opencompass.configs.summarizers.groups.multipl_e import (
+        multiple_summary_groups,
+    )
+    from opencompass.configs.summarizers.groups.humanevalx import (
+        humanevalx_summary_groups,
+    )
+
+# models config
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+for model in models:
+    model['max_seq_len'] = 16384
+    model['max_out_len'] = 8192
+
+# datasets config
+datasets = sum(
+    (v for k, v in locals().items() if k.endswith('_datasets')),
+    [],
+)
+
+for item in humanevalx_datasets:
+    item['eval_cfg']['evaluator'][
+        'ip_address'
+    ] = 'codeeval.opencompass.org.cn/humanevalx'
+    item['eval_cfg']['evaluator']['port'] = ''
+for item in ds1000_datasets:
+    item['eval_cfg']['evaluator'][
+        'ip_address'
+    ] = 'codeeval.opencompass.org.cn/ds1000'
+    item['eval_cfg']['evaluator']['port'] = ''
+
+
+for dataset in datasets:
+    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
+
+
+# summary
+summary_groups = sum(
+    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
+)
+summary_groups.append(
+    {'name': 'humanevalx',
+     'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js']}
+)
+summarizer = dict(
+    dataset_abbrs = [
+        ['bigcodebench_hard_instruct', 'pass@1'],
+        ['bigcodebench_full_instruct', 'pass@1'],
+        ['lcb_code_generation', 'pass@1'],
+        ['openai_humaneval', 'humaneval_pass@1'],
+        ['mbpp', 'score'],
+        ['humaneval_pro', 'pass@1'],
+        ['mbpp_pro', 'pass@1'],
+        ['multiple', 'naive_average'],
+        ['humanevalx', 'naive_average'],
+        ['ds1000', 'naive_average'],
+        '',
+        'humanevalx-python',
+        'humanevalx-cpp',
+        'humanevalx-java',
+        'humanevalx-js',
+        '',
+        'ds1000_Pandas',
+        'ds1000_Numpy',
+        'ds1000_Tensorflow',
+        'ds1000_Scipy',
+        'ds1000_Sklearn',
+        'ds1000_Pytorch',
+        'ds1000_Matplotlib',
+        '',
+        'humaneval-multiple-cpp',
+        'humaneval-multiple-cs',
+        'humaneval-multiple-go',
+        'humaneval-multiple-java',
+        'humaneval-multiple-rb',
+        'humaneval-multiple-js',
+        'humaneval-multiple-php',
+        'humaneval-multiple-r',
+        'humaneval-multiple-rs',
+        'humaneval-multiple-sh',
+        '',
+        'mbpp-multiple-cpp',
+        'mbpp-multiple-cs',
+        'mbpp-multiple-go',
+        'mbpp-multiple-java',
+        'mbpp-multiple-rb',
+        'mbpp-multiple-js',
+        'mbpp-multiple-php',
+        'mbpp-multiple-r',
+        'mbpp-multiple-rs',
+        'mbpp-multiple-sh'
+    ],
+    summary_groups=summary_groups,
+)
+
+work_dir = 'outputs/code'
diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py
similarity index 100%
rename from opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py
rename to opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py
diff --git a/opencompass/configs/summarizers/groups/multipl_e.py b/opencompass/configs/summarizers/groups/multipl_e.py
new file mode 100644
index 00000000..1d50c7b6
--- /dev/null
+++ b/opencompass/configs/summarizers/groups/multipl_e.py
@@ -0,0 +1,6 @@
+multiple_summary_groups = []
+
+humaneval_multiple = ['humaneval-multiple-cpp', 'humaneval-multiple-cs', 'humaneval-multiple-go', 'humaneval-multiple-java', 'humaneval-multiple-rb', 'humaneval-multiple-js', 'humaneval-multiple-php', 'humaneval-multiple-r', 'humaneval-multiple-rs', 'humaneval-multiple-sh']
+mbpp_multiple = ['mbpp-multiple-cpp', 'mbpp-multiple-cs', 'mbpp-multiple-go', 'mbpp-multiple-java', 'mbpp-multiple-rb', 'mbpp-multiple-js', 'mbpp-multiple-php', 'mbpp-multiple-r', 'mbpp-multiple-rs', 'mbpp-multiple-sh']
+multiple_summary_groups.append(
+    {'name': 'multiple', 'subsets': humaneval_multiple + mbpp_multiple})
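
For context on how examples/eval_codebench_full.py assembles its run: the config relies on a locals()-scanning idiom to collect every imported *_datasets, *_model, and *_summary_groups variable, and the 'multiple' summary group then reports a naive average over its subsets. The following is a minimal standalone sketch of that pattern, not part of the patch or of OpenCompass; the demo_* names and scores are made up for illustration.

# Standalone sketch of the aggregation idiom used in the config above.
# The *_demo_datasets names and scores are hypothetical.
humaneval_demo_datasets = [{'abbr': 'humaneval-multiple-cpp', 'score': 40.0}]
mbpp_demo_datasets = [{'abbr': 'mbpp-multiple-cpp', 'score': 60.0}]

# Every variable ending in '_datasets' is flattened into one list, exactly as
# the config does with sum((...), []).
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)
assert sorted(d['abbr'] for d in datasets) == ['humaneval-multiple-cpp',
                                               'mbpp-multiple-cpp']

# A summary group names its subsets; averaging their scores mirrors the
# naive_average aggregation that the summarizer rows refer to.
group = {'name': 'multiple', 'subsets': [d['abbr'] for d in datasets]}
scores = {d['abbr']: d['score'] for d in datasets}
print(sum(scores[s] for s in group['subsets']) / len(group['subsets']))  # 50.0

The locals() scan is what lets another benchmark be added by importing one more *_datasets config inside read_base(), with no further wiring in the example file.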