diff --git a/dataset-index.yml b/dataset-index.yml
index 6bd2f784..ef21f4c3 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -2,7 +2,7 @@
     name: IFEval
     category: Instruction Following
     paper: https://arxiv.org/pdf/2311.07911
-    configpath: opencompass/configs/datasets/IFEval/IFEval
+    configpath: opencompass/configs/datasets/IFEval/IFEval_gen.py
     configpath_llmjudge: ''
 - nphard:
     name: NPHardEval
@@ -38,7 +38,7 @@
     name: BigCodeBench
     category: Code
     paper: https://arxiv.org/pdf/2406.15877
-    configpath: opencompass/configs/datasets/bigcodebench
+    configpath: opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
     configpath_llmjudge: ''
 - calm:
     name: CaLM
@@ -56,8 +56,8 @@
     name: KOR-Bench
     category: Reasoning
     paper: https://arxiv.org/pdf/2410.06526v1
-    configpath: opencompass/configs/datasets/korbench
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/korbench/korbench_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
 - lawbench:
     name: LawBench
     category: Knowledge / Law
@@ -74,7 +74,7 @@
     name: LiveCodeBench
     category: Code
     paper: https://arxiv.org/pdf/2403.07974
-    configpath: opencompass/configs/datasets/livecodebench
+    configpath: opencompass/configs/datasets/livecodebench/livecodebench_gen.py
     configpath_llmjudge: ''
 - livemathbench:
     name: LiveMathBench
@@ -104,8 +104,8 @@
     name: MuSR
     category: Reasoning
     paper: https://arxiv.org/pdf/2310.16049
-    configpath: opencompass/configs/datasets/musr
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/musr/musr_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
 - needlebench:
     name: NeedleBench
     category: Long Context
@@ -236,8 +236,8 @@
     name: AIME2024
     category: Examination
     paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
-    configpath: opencompass/configs/datasets/aime2024
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
 - anli:
     name: Adversarial NLI
     category: Reasoning
@@ -282,8 +282,8 @@
     name: BIG-Bench Hard
     category: Reasoning
     paper: https://arxiv.org/pdf/2210.09261
-    configpath: opencompass/configs/datasets/bbh
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/bbh/bbh_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
 - BoolQ:
     name: SuperGLUE / BoolQ
     category: Knowledge
@@ -366,8 +366,8 @@
     name: CMMLU
     category: Understanding
     paper: https://arxiv.org/pdf/2306.09212
-    configpath: opencompass/configs/datasets/cmmlu
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/cmmlu/cmmlu_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py
 - cmnli:
     name: CLUE / CMNLI
     category: Reasoning
@@ -432,8 +432,8 @@
     name: DROP (DROP Simple Eval)
     category: Understanding
     paper: https://arxiv.org/pdf/1903.00161
-    configpath: opencompass/configs/datasets/drop
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/drop/drop_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/drop/drop_llm_judge_gen.py
 - ds1000:
     name: DS-1000
     category: Code
@@ -468,8 +468,8 @@
     name: GPQA
     category: Knowledge
     paper: https://arxiv.org/pdf/2311.12022v1
-    configpath: opencompass/configs/datasets/gpqa
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/gpqa/gpqa_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py
 - gsm8k:
     name: GSM8K
     category: Math
@@ -492,13 +492,13 @@
     name: HellaSwag
     category: Reasoning
     paper: https://arxiv.org/pdf/1905.07830
-    configpath: opencompass/configs/datasets/hellaswag
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/hellaswag/hellaswag_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py
 - humaneval:
     name: HumanEval
     category: Code
     paper: https://arxiv.org/pdf/2107.03374v2
-    configpath: opencompass/configs/datasets/humaneval
+    configpath: opencompass/configs/datasets/humaneval/humaneval_gen.py
     configpath_llmjudge: ''
 - humaneval_cn:
     name: HumanEval-CN
@@ -566,6 +566,12 @@
     paper: https://arxiv.org/pdf/2103.03874
     configpath: opencompass/configs/datasets/math
     configpath_llmjudge: ''
+- math500:
+    name: MATH500
+    category: Math
+    paper: https://github.com/openai/prm800k
+    configpath: opencompass/configs/datasets/math/math_prm800k_500_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
 - math401:
     name: MATH 401
     category: Math
@@ -606,8 +612,8 @@
     name: MMLU
     category: Understanding
     paper: https://arxiv.org/pdf/2009.03300
-    configpath: opencompass/configs/datasets/mmlu
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
 - mmlu_cf:
     name: MMLU-CF
     category: Understanding
@@ -618,8 +624,8 @@
     name: MMLU-Pro
     category: Understanding
     paper: https://arxiv.org/pdf/2406.01574
-    configpath: opencompass/configs/datasets/mmlu_pro
-    configpath_llmjudge: ''
+    configpath: opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py
 - mmmlu:
     name: MMMLU
     category: Language / Understanding
diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
index cb48bb4e..83af4c0e 100644
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
+++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
@@ -1,55 +1,4 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BoolQDatasetV2
-from opencompass.utils.text_postprocessors import (
-    first_option_postprocess,
-)
+from mmengine.config import read_base
 
-QUERY_TEMPLATE = """
-Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
-
-Passage: {passage}
-
-Question: {question}
-
-A. Yes
-B. NO
-
-""".strip()
-
-BoolQ_reader_cfg = dict(
-    input_columns=['question', 'passage'],
-    output_column='label',
-)
-
-BoolQ_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            round=[
-                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
-            ]
-        ),
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer),
-)
-
-BoolQ_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_role='BOT',
-    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
-)
-
-BoolQ_datasets = [
-    dict(
-        abbr='BoolQ',
-        type=BoolQDatasetV2,
-        path='opencompass/boolq',
-        reader_cfg=BoolQ_reader_cfg,
-        infer_cfg=BoolQ_infer_cfg,
-        eval_cfg=BoolQ_eval_cfg,
-    )
-]
\ No newline at end of file
+with read_base():
+    from .SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_llm_judge_gen.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_llm_judge_gen.py
deleted file mode 100644
index cb48bb4e..00000000
--- a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_llm_judge_gen.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BoolQDatasetV2
-from opencompass.utils.text_postprocessors import (
-    first_option_postprocess,
-)
-
-QUERY_TEMPLATE = """
-Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
-
-Passage: {passage}
-
-Question: {question}
-
-A. Yes
-B. NO
-
-""".strip()
-
-BoolQ_reader_cfg = dict(
-    input_columns=['question', 'passage'],
-    output_column='label',
-)
-
-BoolQ_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            round=[
-                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
-            ]
-        ),
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer),
-)
-
-BoolQ_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_role='BOT',
-    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
-)
-
-BoolQ_datasets = [
-    dict(
-        abbr='BoolQ',
-        type=BoolQDatasetV2,
-        path='opencompass/boolq',
-        reader_cfg=BoolQ_reader_cfg,
-        infer_cfg=BoolQ_infer_cfg,
-        eval_cfg=BoolQ_eval_cfg,
-    )
-]
\ No newline at end of file
diff --git a/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py b/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
index e233dcd1..83e1a906 100644
--- a/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
+++ b/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
@@ -164,7 +164,7 @@ for _name in bbh_free_form_sets:
                 name=_name,
             ),
             judge_cfg=dict(),
-            dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
         ),
         pred_role='BOT',
     )
diff --git a/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py b/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
index 58f58944..eb55bf46 100644
--- a/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
+++ b/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
@@ -94,7 +94,7 @@ for category in categories:
                 category=category,
             ),
             judge_cfg=dict(),
-            dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
         ),
         pred_role='BOT',
     )
diff --git a/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
index d4220ddc..198b0470 100644
--- a/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
+++ b/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
@@ -72,7 +72,7 @@ math_eval_cfg = dict(
             reader_cfg=math_reader_cfg,
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess, metric_name='score'),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
     ),
     pred_role='BOT',
 )
diff --git a/opencompass/configs/datasets/race/race_gen.py b/opencompass/configs/datasets/race/race_gen.py
index 3fd646ca..535bac9a 100644
--- a/opencompass/configs/datasets/race/race_gen.py
+++ b/opencompass/configs/datasets/race/race_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .race_gen_69ee4f import race_datasets  # noqa: F401, F403
\ No newline at end of file
+    from .race_gen_69ee4f import race_datasets  # noqa: F401, F403
diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py
index b3dca914..13c6d467 100644
--- a/opencompass/datasets/generic.py
+++ b/opencompass/datasets/generic.py
@@ -61,7 +61,6 @@ def _generic_llmjudge_postprocess(judgement: str):
 def generic_llmjudge_postprocess(
     output: dict,
     output_path: str,
-    metric_name='accuracy',
 ) -> dict:
     judged_answers = []
     origial_responses = []
@@ -78,8 +77,8 @@ def generic_llmjudge_postprocess(
             get_logger().warning(
                 f'No gold answer for {k}, use empty string as reference!')
             references.append('')
-    results = get_final_results(judged_answers, references, origial_responses,
-                                metric_name)
+    results = get_final_results(judged_answers, references, origial_responses)
+    results['details'] = output
     return results
 
 
diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py
index d3d06791..8a0da5b2 100644
--- a/opencompass/summarizers/default.py
+++ b/opencompass/summarizers/default.py
@@ -186,29 +186,17 @@ class DefaultSummarizer:
                     eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
             else:
                 group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
-                group_metrics.append(default_metric)
-                for metric in group_metrics:
-                    for dataset_abbr in sg['subsets']:
-                        if metric == default_metric:
-                            metric_default = dataset_metrics[dataset_abbr][0]
-                            scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \
-                                parsed_results[model_abbr][dataset_abbr][metric_default]
-                            eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
-                        else:
-                            scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
-                                parsed_results[model_abbr][dataset_abbr][metric]
+                if need_smart_metric and len(group_metrics) > 1:
+                    for metric in group_metrics:
+                        for dataset_abbr in sg['subsets']:
+                            scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
                         eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
-                # if need_smart_metric and len(group_metrics) > 1:
-                #     for metric in group_metrics:
-                #         for dataset_abbr in sg['subsets']:
-                #             scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                #         eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
-                # else:
-                #     group_metrics = [default_metric]
-                #     for dataset_abbr in sg['subsets']:
-                #         metric = dataset_metrics[dataset_abbr][0]
-                #         scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
-                #         eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
+                else:
+                    group_metrics = [default_metric]
+                    for dataset_abbr in sg['subsets']:
+                        metric = dataset_metrics[dataset_abbr][0]
+                        scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
+                        eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
 
         result = {}
         for metric in scores:
diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py
index ac463123..7c769060 100644
--- a/opencompass/tasks/openicl_eval.py
+++ b/opencompass/tasks/openicl_eval.py
@@ -314,8 +314,6 @@ class OpenICLEvalTask(BaseTask):
             'Model Postprocess Task: ' +
             f'{task_abbr_from_cfg(self.cfg)}:{model_result_wo_details}')
 
-        # save evaluator config
-
         # Save result
         out_path = get_infer_output_path(
             self.model_cfg,