[Feature] Update MathBench & WikiBench for FullBench (#1521)

* Update MathBench & WikiBench for FullBench

* Update MathBench & WikiBench for FullBench

* Update GPQA & MMLU_Pro

* Update MathBench & WikiBench for FullBench

* Update MathBench & WikiBench for FullBench

* Update MathBench & WikiBench for FullBench

---------

Co-authored-by: liushz <liuhongwei@pjlab.rog.cn>
Author: liushz, 2024-09-18 14:35:30 +08:00 (committed by GitHub)
parent cfbd308edf
commit c9a7026f59
9 changed files with 426 additions and 2 deletions

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=math_postprocess_v2)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)
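For context, a dataset config like the one above is not run on its own; it gets pulled into a top-level evaluation config together with one or more model configs. The sketch below is illustrative only: the import paths and the model config name are placeholders, not files touched by this commit.

from mmengine.config import read_base

with read_base():
    # hypothetical module paths; substitute the real config modules in your checkout
    from .datasets.mathbench.mathbench_gen import mathbench_datasets
    from .models.hf_internlm.hf_internlm2_chat_7b import models

# OpenCompass picks up whatever the top-level config exposes as datasets / models
datasets = [*mathbench_datasets]

Such a config is then typically launched with something like python run.py configs/eval_mathbench.py (the file name here is again a placeholder).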

View File

@@ -11,6 +11,12 @@ zero_shot_prompts = {
    'single_choice_en': [
        dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
    ],
    'cloze_en': [
        dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
    ],
    'cloze_cn': [
        dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
    ]
}

few_shot_prompts = {
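The new cloze prompts ask the model to wrap its final answer in \boxed{}, which the cloze branch's math_postprocess_v2 is then expected to pull back out when grading. A toy regex sketch of that extraction step (illustrative only, not the actual math_postprocess_v2 implementation):

import re

def extract_boxed_answer(text: str) -> str:
    # take the contents of the last \boxed{...} span in the response
    matches = re.findall(r'\\boxed\{([^{}]*)\}', text)
    return matches[-1].strip() if matches else ''

extract_boxed_answer('The sum is 40 + 2, so the answer is \\boxed{42}.')  # -> '42'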

View File

@@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import WikiBenchDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识一步步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:',
}
wikibench_sets = {
'wiki': ['single_choice_cn'],
}
do_circular = True
wikibench_datasets = []
for _split in list(wikibench_sets.keys()):
for _name in wikibench_sets[_split]:
wikibench_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt=single_choice_prompts[_name]),
dict(role='BOT', prompt='{answer}'),
],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
wikibench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
wikibench_datasets.append(
dict(
type=WikiBenchDataset,
path=f'./data/WikiBench/{_name}.jsonl',
name='circular_' + _name if do_circular else _name,
abbr='wikibench-' + _split + '-' + _name + 'circular' if do_circular else '',
reader_cfg=dict(
input_columns=['question'],
output_column='answer',
),
infer_cfg=wikibench_infer_cfg,
eval_cfg=wikibench_eval_cfg,
)
)
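Because do_circular is True, the dataset is loaded under the name circular_single_choice_cn and scored with CircularEvaluator. The idea, sketched below with toy code (this is not the actual evaluator), is that each question appears once per rotation of its ABCD options, and a stricter score only credits questions answered correctly under every rotation:

from collections import defaultdict

def circular_scores(records):
    # records: list of (question_id, is_correct) pairs over all rotated copies
    per_question = defaultdict(list)
    for qid, ok in records:
        per_question[qid].append(ok)
    acc = sum(ok for _, ok in records) / len(records)  # plain accuracy over all copies
    strict = sum(all(v) for v in per_question.values()) / len(per_question)  # every rotation must be right
    return {'accuracy': 100 * acc, 'strict_accuracy': 100 * strict}

circular_scores([('q1', True), ('q1', True), ('q1', True), ('q1', True),
                 ('q2', True), ('q2', False), ('q2', True), ('q2', True)])
# -> {'accuracy': 87.5, 'strict_accuracy': 50.0}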

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=math_postprocess_v2)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@@ -11,6 +11,12 @@ zero_shot_prompts = {
    'single_choice_en': [
        dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
    ],
    'cloze_en': [
        dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
    ],
    'cloze_cn': [
        dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
    ]
}

few_shot_prompts = {

View File

@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess
gpqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer')
hint = f'对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
gpqa_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
),
prompt_template=dict(
type=PromptTemplate,
template={
opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
},
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer))
gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
gpqa_datasets = []
gpqa_subsets = {
# 'extended': 'gpqa_extended.csv',
# 'main': 'gpqa_main.csv',
'diamond': 'gpqa_diamond.csv'
}
for split in list(gpqa_subsets.keys()):
gpqa_datasets.append(
dict(
abbr='GPQA_' + split,
type=GPQADataset,
path='./data/gpqa/',
name=gpqa_subsets[split],
reader_cfg=gpqa_reader_cfg,
infer_cfg=gpqa_infer_cfg,
eval_cfg=gpqa_eval_cfg)
)
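This GPQA config is the PPL-style variant typically used for base models: FixKRetriever supplies five fixed in-context examples, and instead of generating an answer, the rendered question is completed with each of 'Answer: A' through 'Answer: D' and the option whose completion the model scores as most likely wins. A minimal sketch of that selection rule, assuming a hypothetical loss_fn helper (the real work is done by PPLInferencer):

def ppl_choice(rendered_question, options, loss_fn):
    # loss_fn(text) -> average per-token loss under the model (assumed helper)
    scored = {opt: loss_fn(f'{rendered_question}\nAnswer: {opt}') for opt in options}
    return min(scored, key=scored.get)  # lowest loss == highest likelihood

# ppl_choice(rendered_question, ['A', 'B', 'C', 'D'], loss_fn=my_model_loss)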

View File

@@ -0,0 +1,47 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator
with read_base():
from .mmlu_pro_categories import categories
mmlu_pro_datasets = []
for category in categories:
hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
mmlu_pro_reader_cfg = dict(
input_columns=['question', 'cot_content', 'options_str'],
output_column='answer_string',
train_split='validation',
test_split='test',
)
mmlu_pro_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=f'{question_and_options}\nAnswer: {{answer}}'),
prompt_template=dict(
type=PromptTemplate,
template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer, max_out_len=100)
)
mmlu_pro_eval_cfg = dict(
evaluator=dict(type=MMLUProBaseEvaluator)
)
mmlu_pro_datasets.append(
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,
eval_cfg=mmlu_pro_eval_cfg,
))
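For reference, this is roughly how the pieces compose at inference time: FixKRetriever always selects validation items 0-4, ice_template renders each of them with its gold answer_string, and prompt_template splices them in at </E> ahead of the test question. The skeleton below is an assumed illustration of the final prompt, not output captured from a run:

rendered_prompt_skeleton = (
    'Answer the following multiple choice question about math, and give your answer option directly.\n'
    'Question:\n<validation question 0>\nOptions:\n<options 0>\nAnswer: <gold option and text 0>\n'
    # ... four more in-context examples ...
    'Question:\n<test question>\nOptions:\n<test options>\nAnswer: '
)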

View File

@@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import WikiBenchDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识一步步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:',
}
wikibench_sets = {
'wiki': ['single_choice_cn'],
}
do_circular = True
wikibench_datasets = []
for _split in list(wikibench_sets.keys()):
for _name in wikibench_sets[_split]:
wikibench_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt=single_choice_prompts[_name]),
dict(role='BOT', prompt='{answer}'),
],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
wikibench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
wikibench_datasets.append(
dict(
type=WikiBenchDataset,
path=f'./data/WikiBench/{_name}.jsonl',
name='circular_' + _name if do_circular else _name,
abbr='wikibench-' + _split + '-' + _name + 'circular' if do_circular else '',
reader_cfg=dict(
input_columns=['question'],
output_column='answer',
),
infer_cfg=wikibench_infer_cfg,
eval_cfg=wikibench_eval_cfg,
)
)

View File

@@ -3,19 +3,26 @@
from datasets import load_dataset

from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset

CHOICES=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']


def _parse(item):
    s = ''
    item['answer_string'] = ''
    for i, opt in enumerate(item['options']):
        if opt == 'N/A':
            continue
        option = '{}. {}\n'.format(CHOICES[i], opt)
        s += option
        if item['answer'] == CHOICES[i]:
            item['answer_string'] = option
    item['options_str'] = s.strip()
    item['cot_content'] = item['cot_content'].removeprefix("A: Let's think step by step.").strip()
    return item
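A quick worked example of what the updated _parse produces for a toy item: 'N/A' options are dropped but the remaining options keep their original letters, and the new answer_string field carries the gold option together with its text.

item = {
    'question': '2 + 2 = ?',
    'options': ['3', '4', 'N/A', '5'],
    'answer': 'B',
    'cot_content': "A: Let's think step by step. 2 + 2 is 4.",
}
item = _parse(item)
# item['options_str']   == 'A. 3\nB. 4\nD. 5'
# item['answer_string'] == 'B. 4\n'
# item['cot_content']   == '2 + 2 is 4.'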
@@ -31,3 +38,38 @@ class MMLUProDataset(BaseDataset):
        mmlu_pro = mmlu_pro.filter(lambda x: x['category'] == category)
        mmlu_pro = mmlu_pro.map(_parse)
        return mmlu_pro
class MMLUProBaseEvaluator(BaseEvaluator):
def is_equal(self, pred, refer):
try:
refer_option, refer_string = refer.split('. ')
if pred in CHOICES and refer_option == pred:
return True
elif refer_string.strip() == pred:
return True
else :
return False
except Exception:
pass
return False
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
correct = 0
count = 0
details = []
for i, j in zip(predictions, references):
i = i.split('\n')[0].strip()
detail = {'pred': i, 'answer': j, 'correct': False}
count += 1
if self.is_equal(i, j):
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
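And a toy run of the new evaluator, showing that is_equal accepts either the bare option letter or the answer text, and that only the first line of each prediction is considered:

evaluator = MMLUProBaseEvaluator()
result = evaluator.score(
    predictions=['B', 'Paris\nBecause the capital of France is Paris.'],
    references=['B. 4', 'C. Paris'],
)
# result['accuracy'] == 100.0; per-item matches are listed under result['details']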