diff --git a/README.md b/README.md
index 1a65dfb5..6d8cabe5 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
 We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
 
+You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
+
 ## 🛠️ Installation
diff --git a/README_zh-CN.md b/README_zh-CN.md
index e2ec3f0c..21c0d666 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -77,6 +77,8 @@
 We will progressively publish detailed performance leaderboards for open-source and API models; see the [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home). To join the evaluation, please send the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
 
+You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
+
 ## 🛠️ Installation Guide
diff --git a/configs/eval_academic_leaderboard_202412.py b/configs/eval_academic_leaderboard_202412.py
index 0a9e19a5..378a462c 100644
--- a/configs/eval_academic_leaderboard_202412.py
+++ b/configs/eval_academic_leaderboard_202412.py
@@ -10,12 +10,10 @@ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
 #######################################################################
 with read_base():
     # Datasets Part
-    ## Core Set
     # Knowledge
     from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
         mmlu_pro_datasets,
     )
-
     # General Reasoning
     from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import (
         gpqa_datasets,
@@ -23,22 +21,19 @@ with read_base():
     from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import (
         bbh_datasets,
     )
-    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import (
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
         humaneval_datasets,
     )
 
     # Instruction Following
-    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import (
+    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
         ifeval_datasets,
     )
-    from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import (
+    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
         LCBCodeGeneration_dataset,
     )
 
     # Math
-    from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import (
-        cmo_fib_datasets,
-    )
     from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
         aime2024_datasets,
     )
@@ -77,7 +72,6 @@ core_summary_groups = [
             ['IFEval', 'Prompt-level-strict-accuracy'],
             ['bbh', 'naive_average'],
             ['math_prm800k_500', 'accuracy'],
-            ['cmo_fib', 'accuracy'],
             ['aime2024', 'accuracy'],
             ['GPQA_diamond', 'accuracy'],
             ['mmlu_pro', 'naive_average'],
@@ -101,7 +95,6 @@ summarizer = dict(
         '',
         'Math Calculation',
         ['math_prm800k_500', 'accuracy'],
-        ['cmo_fib', 'accuracy'],
         ['aime2024', 'accuracy'],
         '',
         'Knowledge',
diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py b/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py
new file mode 100644
index 00000000..ceb3514f
--- /dev/null
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py
@@ -0,0 +1,164 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+from opencompass.datasets.livecodebench import TestOutputPromptConstants
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# TestOutput Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    LCBCodeExecution_dataset,
+    LCBTestOutput_dataset,
+]
diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py
index d4690bb3..6c71a60f 100644
--- a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py
+++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py
@@ -23,7 +23,7 @@ math_infer_cfg = dict(
         ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=1024),
+    inferencer=dict(type=GenInferencer),
 )
 
 # postprocess v2
diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py
index 6f215ede..8d8e06f3 100644
--- a/opencompass/datasets/bigcodebench/bigcodebench.py
+++ b/opencompass/datasets/bigcodebench/bigcodebench.py
@@ -163,6 +163,8 @@ class BigCodeBenchEvaluator(BaseEvaluator):
                 logger.info('Read timeout error. Retrying in 4s...')
                 time.sleep(4)
 
+        if 'pass@1' in pass_at_k.keys():
+            pass_at_k['pass@1'] *= 100
         dump_results = {'details': results}
         dump_results.update(pass_at_k)
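
Note on reproducing the CompassAcademic results referenced in the README change above: a config such as configs/eval_academic_leaderboard_202412.py is typically launched through OpenCompass's entry point (for example `python run.py configs/eval_academic_leaderboard_202412.py`), and internally it gathers the imported dataset definitions into a single `datasets` list. The sketch below is a minimal illustration of that gathering pattern, not the committed config: the module paths are copied from the imports in this diff, while the `sum(...)`/`append(...)` assembly and the `run.py` invocation are assumptions based on common OpenCompass config conventions.

# Minimal sketch, assuming standard OpenCompass config conventions;
# the real configs/eval_academic_leaderboard_202412.py is authoritative.
from mmengine.config import read_base

with read_base():
    # Module paths below mirror the imports shown in this diff.
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
        mmlu_pro_datasets,
    )
    from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
        aime2024_datasets,
    )
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
        LCBCodeGeneration_dataset,
    )

# Flatten every imported `*_datasets` list into one list, then add the
# single LiveCodeBench code-generation entry (assumed assembly pattern).
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')), []
)
datasets.append(LCBCodeGeneration_dataset)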