From 2639d113d80a20150312dda9183b9fd5a353a3fe Mon Sep 17 00:00:00 2001
From: Myhs-phz
Date: Mon, 24 Mar 2025 08:27:45 +0000
Subject: [PATCH] fix

---
 dataset-index.yml                                           | 6 ++++++
 opencompass/configs/datasets/aime2024/aime2024_gen.py       | 3 ++-
 opencompass/configs/datasets/bbh/bbh_gen.py                 | 4 ++--
 opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py       | 4 ++--
 .../configs/datasets/bigcodebench/bigcodebench_gen.py       | 6 +++---
 opencompass/configs/datasets/korbench/korbench_gen.py       | 2 +-
 .../configs/datasets/korbench/korbench_llm_judge_gen.py     | 2 +-
 .../configs/datasets/math/math_prm800k_500_llm_judge_gen.py | 2 +-
 8 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/dataset-index.yml b/dataset-index.yml
index 3d916cd9..9c7c301a 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -313,6 +313,12 @@
     paper: https://arxiv.org/pdf/2210.09261
     configpath: opencompass/configs/datasets/bbh/bbh_gen.py
     configpath_llmjudge: opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
+- bbeh:
+    name: BIG-Bench Extra Hard
+    category: Reasoning
+    paper: https://arxiv.org/abs/2502.19187
+    configpath: opencompass/configs/datasets/bbeh
+    configpath_llmjudge: ''
 - BoolQ:
     name: SuperGLUE / BoolQ
     category: Knowledge
diff --git a/opencompass/configs/datasets/aime2024/aime2024_gen.py b/opencompass/configs/datasets/aime2024/aime2024_gen.py
index 45df09ab..45e9a6ee 100644
--- a/opencompass/configs/datasets/aime2024/aime2024_gen.py
+++ b/opencompass/configs/datasets/aime2024/aime2024_gen.py
@@ -2,6 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
+from opencompass.datasets import CustomDataset
 
 
 aime2024_reader_cfg = dict(
@@ -20,7 +21,7 @@ aime2024_infer_cfg = dict(
         )
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=2048)
+    inferencer=dict(type=GenInferencer)
 )
 
 aime2024_eval_cfg = dict(
diff --git a/opencompass/configs/datasets/bbh/bbh_gen.py b/opencompass/configs/datasets/bbh/bbh_gen.py
index b98c1149..ce93d6bb 100644
--- a/opencompass/configs/datasets/bbh/bbh_gen.py
+++ b/opencompass/configs/datasets/bbh/bbh_gen.py
@@ -54,7 +54,7 @@ for _name in bbh_multiple_choice_sets:
                 )
             ])),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=512))
+        inferencer=dict(type=GenInferencer))
     bbh_eval_cfg = dict(
         evaluator=dict(type=BBHEvaluator_mcq),
         pred_role='BOT',
@@ -85,7 +85,7 @@ for _name in bbh_free_form_sets:
                 )
             ])),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=512))
+        inferencer=dict(type=GenInferencer))
     bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
 
     bbh_datasets.append(
diff --git a/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py b/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
index 83e1a906..c846ee69 100644
--- a/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
+++ b/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
@@ -81,7 +81,7 @@ for _name in bbh_multiple_choice_sets:
                 )
             ])),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=512))
+        inferencer=dict(type=GenInferencer))
     bbh_eval_cfg = dict(
         evaluator=dict(
             type=GenericLLMEvaluator,
@@ -137,7 +137,7 @@ for _name in bbh_free_form_sets:
                 )
             ])),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=512))
+        inferencer=dict(type=GenInferencer))
     bbh_eval_cfg = dict(
         evaluator=dict(
             type=GenericLLMEvaluator,
diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
index 02f89ff4..cd7cff98 100644
--- a/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
@@ -15,9 +15,9 @@ bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
         round=[
             dict(role='HUMAN', prompt='{instruct_prompt}'),
         ])),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer,
-                    max_out_len=8192))
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer,)
+)
 
 bigcodebench_hard_eval_cfg = dict(
     evaluator=dict(
diff --git a/opencompass/configs/datasets/korbench/korbench_gen.py b/opencompass/configs/datasets/korbench/korbench_gen.py
index 02d89928..f0a2c83c 100644
--- a/opencompass/configs/datasets/korbench/korbench_gen.py
+++ b/opencompass/configs/datasets/korbench/korbench_gen.py
@@ -37,7 +37,7 @@ for category in categories:
     infer_cfg = dict(
         prompt_template=prompt_template,
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=1024),
+        inferencer=dict(type=GenInferencer),
     )
 
     # Evaluation configuration
diff --git a/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py b/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
index eb55bf46..e334be03 100644
--- a/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
+++ b/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
@@ -63,7 +63,7 @@ for category in categories:
     infer_cfg = dict(
         prompt_template=prompt_template,
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=1024),
+        inferencer=dict(type=GenInferencer),
     )
 
     # Evaluation configuration
diff --git a/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
index 198b0470..5c714cb2 100644
--- a/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
+++ b/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
@@ -17,7 +17,7 @@ math_infer_cfg = dict(
         ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=8192),
+    inferencer=dict(type=GenInferencer),
 )
 
 GRADER_TEMPLATE = """