diff --git a/examples/eval_codebench.py b/examples/eval_codebench.py
index 75f19a24..fdc4aa20 100644
--- a/examples/eval_codebench.py
+++ b/examples/eval_codebench.py
@@ -150,5 +150,4 @@ summarizer = dict(
     summary_groups=summary_groups,
 )
 
-# work_dir = 'outputs/code'
-work_dir = 'outputs/code_2'
+work_dir = 'outputs/code'
diff --git a/examples/eval_codebench_passk.py b/examples/eval_codebench_passk.py
index d9015a8d..0ffd3e6f 100644
--- a/examples/eval_codebench_passk.py
+++ b/examples/eval_codebench_passk.py
@@ -64,6 +64,102 @@ for dataset in datasets:
     dataset['eval_cfg']['evaluator']['num_repeats'] = num_repeats
     dataset['eval_cfg']['evaluator']['k'] = k
     dataset['num_repeats'] = num_repeats
-    dataset['abbr'] += f'_passk'
+    # Keep the '_passk' suffix: every summarizer row below looks datasets up
+    # by an abbr that ends in '_passk'.
+    dataset['abbr'] += '_passk'
+
+# Summary layout: the same dataset rows repeated for pass@1, pass@3, pass@5.
+# NOTE(review): the bare 'pass@k' strings and '' appear to act as section
+# headings / blank separator rows -- confirm against the summarizer.
+summarizer = dict(
+    dataset_abbrs=[
+        'pass@1',
+        ['bigcodebench_full_instruct_passk', 'pass@1'],
+        ['bigcodebench_hard_instruct_passk', 'pass@1'],
+        ['lcb_code_generation_passk', 'pass@1'],
+        ['openai_humaneval_passk_passk', 'humaneval_pass@1'],
+        ['humaneval_pro_passk', 'pass@1'],
+        ['mbpp_passk_passk', 'pass@1'],
+        ['mbpp_pro_passk', 'pass@1'],
+        ['humaneval-multiple-cpp_passk', 'pass@1'],
+        ['humaneval-multiple-cs_passk', 'pass@1'],
+        ['humaneval-multiple-go_passk', 'pass@1'],
+        ['humaneval-multiple-java_passk', 'pass@1'],
+        ['humaneval-multiple-rb_passk', 'pass@1'],
+        ['humaneval-multiple-js_passk', 'pass@1'],
+        ['humaneval-multiple-php_passk', 'pass@1'],
+        ['humaneval-multiple-r_passk', 'pass@1'],
+        ['humaneval-multiple-rs_passk', 'pass@1'],
+        ['humaneval-multiple-sh_passk', 'pass@1'],
+        ['mbpp-multiple-cpp_passk', 'pass@1'],
+        ['mbpp-multiple-cs_passk', 'pass@1'],
+        ['mbpp-multiple-go_passk', 'pass@1'],
+        ['mbpp-multiple-java_passk', 'pass@1'],
+        ['mbpp-multiple-rb_passk', 'pass@1'],
+        ['mbpp-multiple-js_passk', 'pass@1'],
+        ['mbpp-multiple-php_passk', 'pass@1'],
+        ['mbpp-multiple-r_passk', 'pass@1'],
+        ['mbpp-multiple-rs_passk', 'pass@1'],
+        ['mbpp-multiple-sh_passk', 'pass@1'],
+        '',
+        'pass@3',
+        ['bigcodebench_full_instruct_passk', 'pass@3'],
+        ['bigcodebench_hard_instruct_passk', 'pass@3'],
+        ['lcb_code_generation_passk', 'pass@3'],
+        ['openai_humaneval_passk_passk', 'humaneval_pass@3'],
+        ['humaneval_pro_passk', 'pass@3'],
+        ['mbpp_passk_passk', 'pass@3'],
+        ['mbpp_pro_passk', 'pass@3'],
+        ['humaneval-multiple-cpp_passk', 'pass@3'],
+        ['humaneval-multiple-cs_passk', 'pass@3'],
+        ['humaneval-multiple-go_passk', 'pass@3'],
+        ['humaneval-multiple-java_passk', 'pass@3'],
+        ['humaneval-multiple-rb_passk', 'pass@3'],
+        ['humaneval-multiple-js_passk', 'pass@3'],
+        ['humaneval-multiple-php_passk', 'pass@3'],
+        ['humaneval-multiple-r_passk', 'pass@3'],
+        ['humaneval-multiple-rs_passk', 'pass@3'],
+        ['humaneval-multiple-sh_passk', 'pass@3'],
+        ['mbpp-multiple-cpp_passk', 'pass@3'],
+        ['mbpp-multiple-cs_passk', 'pass@3'],
+        ['mbpp-multiple-go_passk', 'pass@3'],
+        ['mbpp-multiple-java_passk', 'pass@3'],
+        ['mbpp-multiple-rb_passk', 'pass@3'],
+        ['mbpp-multiple-js_passk', 'pass@3'],
+        ['mbpp-multiple-php_passk', 'pass@3'],
+        ['mbpp-multiple-r_passk', 'pass@3'],
+        ['mbpp-multiple-rs_passk', 'pass@3'],
+        ['mbpp-multiple-sh_passk', 'pass@3'],
+        '',
+        'pass@5',
+        ['bigcodebench_full_instruct_passk', 'pass@5'],
+        ['bigcodebench_hard_instruct_passk', 'pass@5'],
+        ['lcb_code_generation_passk', 'pass@5'],
+        ['openai_humaneval_passk_passk', 'humaneval_pass@5'],
+        ['humaneval_pro_passk', 'pass@5'],
+        ['mbpp_passk_passk', 'pass@5'],
+        ['mbpp_pro_passk', 'pass@5'],
+        ['humaneval-multiple-cpp_passk', 'pass@5'],
+        ['humaneval-multiple-cs_passk', 'pass@5'],
+        ['humaneval-multiple-go_passk', 'pass@5'],
+        ['humaneval-multiple-java_passk', 'pass@5'],
+        ['humaneval-multiple-rb_passk', 'pass@5'],
+        ['humaneval-multiple-js_passk', 'pass@5'],
+        ['humaneval-multiple-php_passk', 'pass@5'],
+        ['humaneval-multiple-r_passk', 'pass@5'],
+        ['humaneval-multiple-rs_passk', 'pass@5'],
+        ['humaneval-multiple-sh_passk', 'pass@5'],
+        ['mbpp-multiple-cpp_passk', 'pass@5'],
+        ['mbpp-multiple-cs_passk', 'pass@5'],
+        ['mbpp-multiple-go_passk', 'pass@5'],
+        ['mbpp-multiple-java_passk', 'pass@5'],
+        ['mbpp-multiple-rb_passk', 'pass@5'],
+        ['mbpp-multiple-js_passk', 'pass@5'],
+        ['mbpp-multiple-php_passk', 'pass@5'],
+        ['mbpp-multiple-r_passk', 'pass@5'],
+        ['mbpp-multiple-rs_passk', 'pass@5'],
+        ['mbpp-multiple-sh_passk', 'pass@5'],
+    ],
+)
 
 work_dir = 'outputs/code_passk'