diff --git a/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py b/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py
index af8e4cd8..6e8fad15 100644
--- a/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py
+++ b/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py
@@ -69,7 +69,7 @@ for _split in mathbench_sets:
         mathbench_datasets.append(
             dict(
-                abbr='mathbench-' + _split + '-' + _name,
+                abbr='mathbench-no_cot-' + _split + '-' + _name,
                 type=MathBenchDataset,
                 path=f'data/mathbench_v1/{_split}',
                 name=_name,
diff --git a/configs/datasets/MathBench/mathbench_2024_gen_fc2a24.py b/configs/datasets/MathBench/mathbench_2024_gen_fc2a24.py
new file mode 100644
index 00000000..3c0b77d8
--- /dev/null
+++ b/configs/datasets/MathBench/mathbench_2024_gen_fc2a24.py
@@ -0,0 +1,81 @@
+from mmengine.config import read_base
+from copy import deepcopy
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+from opencompass.datasets import MathBenchDataset, mathbench_postprocess
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+with read_base():
+    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
+
+# Max number of few-shot examples for this dataset is 4
+num_shot = 4
+# Whether to generate a reasoning path, only for single-choice questions
+with_reasoning = True
+# Whether to use circular evaluation
+with_circular_eval = True
+# Whether to use PPL mode for single-choice questions
+use_ppl_single_choice = False
+
+assert 0 <= num_shot <= 4
+if num_shot == 0:
+    prompts = zero_shot_prompts
+else:
+    prompts = {name: p[-2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
+
+mathbench_datasets = []
+for _split in mathbench_sets:
+    for _name in mathbench_sets[_split]:
+        if 'single_choice' in _name:
+            if with_reasoning:
+                template_round = prompts[_name + '_with_reasoning']
+            else:
+                template_round = prompts[_name]
+        else:
+            template_round = prompts[_name]
+
+        if 'single_choice' in _name:
+            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+        else:
+            pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
+
+        if 'single_choice' in _name and with_circular_eval:
+            evaluator = dict(type=CircularEvaluator)
+        else:
+            evaluator = dict(type=AccEvaluator)
+
+        # assemble the final config
+        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
+            template = {}
+            for answer in ['A', 'B', 'C', 'D']:
+                one_template_round = deepcopy(template_round)
+                one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
+                template[answer] = dict(round=one_template_round)
+            mathbench_infer_cfg = dict(
+                prompt_template=dict(type=PromptTemplate, template=template),
+                retriever=dict(type=ZeroRetriever),
+                inferencer=dict(type=PPLInferencer),
+            )
+        else:
+            mathbench_infer_cfg = dict(
+                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
+                retriever=dict(type=ZeroRetriever),
+                inferencer=dict(type=GenInferencer, max_out_len=2048),
+            )
+        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
+
+        mathbench_datasets.append(
+            dict(
+                abbr='mathbench-' + _split + '-' + _name,
+                type=MathBenchDataset,
+                path=f'data/mathbench_v1/{_split}',
+                name=_name,
+                with_circular=with_circular_eval,
+                reader_cfg=mathbench_reader_cfg,
+                infer_cfg=mathbench_infer_cfg,
+                eval_cfg=mathbench_eval_cfg,
+            )
+        )
diff --git a/configs/datasets/MathBench/mathbench_gen.py b/configs/datasets/MathBench/mathbench_gen.py
index 90e8e5fe..d2a361da 100644
--- a/configs/datasets/MathBench/mathbench_gen.py
+++ b/configs/datasets/MathBench/mathbench_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .mathbench_2024_gen_1dc21d import mathbench_datasets  # noqa: F401, F403
+    from .mathbench_2024_gen_fc2a24 import mathbench_datasets  # noqa: F401, F403
diff --git a/configs/eval_mathbench.py b/configs/eval_mathbench.py
new file mode 100644
index 00000000..bc4569cf
--- /dev/null
+++ b/configs/eval_mathbench.py
@@ -0,0 +1,42 @@
+from mmengine.config import read_base
+
+with read_base():
+
+    # Import models
+    from .models.hf_llama.hf_llama3_8b_instruct import models as llama3_8b_instruct_model
+    from .models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_chat_7b_model
+
+    # Import datasets
+    from .datasets.MathBench.mathbench_gen import mathbench_datasets
+
+    # Import summarizers to display results
+    from .summarizers.groups.mathbench_v1_2024 import summarizer  # Grouped results for MathBench-A and MathBench-T separately
+    # from .summarizers.mathbench_v1 import summarizer  # Detailed results for every sub-dataset
+    # from .summarizers.groups.mathbench_v1_2024_lang import summarizer  # Grouped bilingual results
+
+datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+from opencompass.runners import LocalRunner
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=8),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=256,
+        task=dict(type=OpenICLEvalTask)
+    ),
+)
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=256,
+        task=dict(type=OpenICLInferTask)
+    ),
+)
+
+work_dir = './outputs/mathbench_results'
diff --git a/configs/summarizers/groups/mathbench_v1_2024.py b/configs/summarizers/groups/mathbench_v1_2024.py
new file mode 100644
index 00000000..c34df7d7
--- /dev/null
+++ b/configs/summarizers/groups/mathbench_v1_2024.py
@@ -0,0 +1,44 @@
+
+mathbench_2024_summary_groups = [
+    {'name': 'college', 'subsets': [['mathbench-college-single_choice_cn', 'perf_4'], ['mathbench-college-single_choice_en', 'perf_4']]},
+    {'name': 'high', 'subsets': [['mathbench-high-single_choice_cn', 'perf_4'], ['mathbench-high-single_choice_en', 'perf_4']]},
+    {'name': 'middle', 'subsets': [['mathbench-middle-single_choice_cn', 'perf_4'], ['mathbench-middle-single_choice_en', 'perf_4']]},
+    {'name': 'primary', 'subsets': [['mathbench-primary-cloze_cn', 'accuracy'], ['mathbench-primary-cloze_en', 'accuracy']]},
+    {'name': 'arithmetic', 'subsets': [['mathbench-arithmetic-cloze_en', 'accuracy']]},
+    {'name': 'mathbench-a-cn', 'subsets': ['mathbench-college-single_choice_cn', 'mathbench-high-single_choice_cn', 'mathbench-middle-single_choice_cn', 'mathbench-primary-cloze_cn']},
+    {'name': 'mathbench-a-en', 'subsets': ['mathbench-college-single_choice_en', 'mathbench-high-single_choice_en', 'mathbench-middle-single_choice_en', 'mathbench-primary-cloze_en']},
+    {'name': 'mathbench-a (average)', 'subsets': ['college', 'high', 'middle', 'primary', 'arithmetic']},
+
+    {'name': 'college_knowledge', 'subsets': [['mathbench-college_knowledge-single_choice_cn', 'perf_4'], ['mathbench-college_knowledge-single_choice_en', 'perf_4']]},
+    {'name': 'high_knowledge', 'subsets': [['mathbench-high_knowledge-single_choice_cn', 'perf_4'], ['mathbench-high_knowledge-single_choice_en', 'perf_4']]},
+    {'name': 'middle_knowledge', 'subsets': [['mathbench-middle_knowledge-single_choice_cn', 'perf_4'], ['mathbench-middle_knowledge-single_choice_en', 'perf_4']]},
+    {'name': 'primary_knowledge', 'subsets': [['mathbench-primary_knowledge-single_choice_cn', 'perf_4'], ['mathbench-primary_knowledge-single_choice_en', 'perf_4']]},
+    {'name': 'mathbench-t-cn', 'subsets': ['mathbench-college_knowledge-single_choice_cn', 'mathbench-high_knowledge-single_choice_cn', 'mathbench-middle_knowledge-single_choice_cn', 'mathbench-primary_knowledge-single_choice_cn']},
+    {'name': 'mathbench-t-en', 'subsets': ['mathbench-college_knowledge-single_choice_en', 'mathbench-high_knowledge-single_choice_en', 'mathbench-middle_knowledge-single_choice_en', 'mathbench-primary_knowledge-single_choice_en']},
+    {'name': 'mathbench-t (average)', 'subsets': ['college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge']},
+
+    {'name': 'Overall', 'subsets': ['mathbench-a (average)', 'mathbench-t (average)']},
+]
+
+summarizer = dict(
+    dataset_abbrs=[
+        '###### MathBench-A: Application Part ######',
+        'college',
+        'high',
+        'middle',
+        'primary',
+        'arithmetic',
+        'mathbench-a (average)',
+
+        '###### MathBench-T: Theory Part ######',
+        'college_knowledge',
+        'high_knowledge',
+        'middle_knowledge',
+        'primary_knowledge',
+        'mathbench-t (average)',
+
+        '###### Overall: Average between MathBench-A and MathBench-T ######',
+        'Overall',
+    ],
+    summary_groups=mathbench_2024_summary_groups,
+)
diff --git a/configs/summarizers/groups/mathbench_v1_2024_lang.py b/configs/summarizers/groups/mathbench_v1_2024_lang.py
new file mode 100644
index 00000000..3693f7f4
--- /dev/null
+++ b/configs/summarizers/groups/mathbench_v1_2024_lang.py
@@ -0,0 +1,57 @@
+
+mathbench_2024_summary_groups = [
+    {'name': 'college', 'subsets': [['mathbench-college-single_choice_cn', 'perf_4'], ['mathbench-college-single_choice_en', 'perf_4']]},
+    {'name': 'high', 'subsets': [['mathbench-high-single_choice_cn', 'perf_4'], ['mathbench-high-single_choice_en', 'perf_4']]},
+    {'name': 'middle', 'subsets': [['mathbench-middle-single_choice_cn', 'perf_4'], ['mathbench-middle-single_choice_en', 'perf_4']]},
+    {'name': 'primary', 'subsets': [['mathbench-primary-cloze_cn', 'accuracy'], ['mathbench-primary-cloze_en', 'accuracy']]},
+    {'name': 'arithmetic', 'subsets': [['mathbench-arithmetic-cloze_en', 'accuracy']]},
+    {'name': 'mathbench-a-cn-average', 'subsets': ['mathbench-college-single_choice_cn', 'mathbench-high-single_choice_cn', 'mathbench-middle-single_choice_cn', 'mathbench-primary-cloze_cn']},
+    {'name': 'mathbench-a-en-average', 'subsets': ['mathbench-college-single_choice_en', 'mathbench-high-single_choice_en', 'mathbench-middle-single_choice_en', 'mathbench-primary-cloze_en']},
+    {'name': 'mathbench-a (average)', 'subsets': ['college', 'high', 'middle', 'primary', 'arithmetic']},
+
+    {'name': 'college_knowledge', 'subsets': [['mathbench-college_knowledge-single_choice_cn', 'perf_4'], ['mathbench-college_knowledge-single_choice_en', 'perf_4']]},
+    {'name': 'high_knowledge', 'subsets': [['mathbench-high_knowledge-single_choice_cn', 'perf_4'], ['mathbench-high_knowledge-single_choice_en', 'perf_4']]},
+    {'name': 'middle_knowledge', 'subsets': [['mathbench-middle_knowledge-single_choice_cn', 'perf_4'], ['mathbench-middle_knowledge-single_choice_en', 'perf_4']]},
+    {'name': 'primary_knowledge', 'subsets': [['mathbench-primary_knowledge-single_choice_cn', 'perf_4'], ['mathbench-primary_knowledge-single_choice_en', 'perf_4']]},
+    {'name': 'mathbench-t-cn-average', 'subsets': ['mathbench-college_knowledge-single_choice_cn', 'mathbench-high_knowledge-single_choice_cn', 'mathbench-middle_knowledge-single_choice_cn', 'mathbench-primary_knowledge-single_choice_cn']},
+    {'name': 'mathbench-t-en-average', 'subsets': ['mathbench-college_knowledge-single_choice_en', 'mathbench-high_knowledge-single_choice_en', 'mathbench-middle_knowledge-single_choice_en', 'mathbench-primary_knowledge-single_choice_en']},
+    {'name': 'mathbench-t (average)', 'subsets': ['college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge']},
+
+    {'name': 'Overall', 'subsets': ['mathbench-a (average)', 'mathbench-t (average)']},
+]
+
+
+summarizer = dict(
+    dataset_abbrs=[
+        '########################################################',
+        '###### MathBench-A-CN: Application Part (Chinese) ######',
+        'mathbench-college-single_choice_cn',
+        'mathbench-high-single_choice_cn',
+        'mathbench-middle-single_choice_cn',
+        'mathbench-primary-cloze_cn',
+        'mathbench-a-cn-average',
+
+        '###### MathBench-A-EN: Application Part (English) ######',
+        'mathbench-college-single_choice_en',
+        'mathbench-high-single_choice_en',
+        'mathbench-middle-single_choice_en',
+        'mathbench-primary-cloze_en',
+        'mathbench-a-en-average',
+
+        '###################################################',
+        '###### MathBench-T-CN: Theory Part (Chinese) ######',
+        'mathbench-college_knowledge-single_choice_cn',
+        'mathbench-high_knowledge-single_choice_cn',
+        'mathbench-middle_knowledge-single_choice_cn',
+        'mathbench-primary_knowledge-single_choice_cn',
+        'mathbench-t-cn-average',
+
+        '###### MathBench-T-EN: Theory Part (English) ######',
+        'mathbench-college_knowledge-single_choice_en',
+        'mathbench-high_knowledge-single_choice_en',
+        'mathbench-middle_knowledge-single_choice_en',
+        'mathbench-primary_knowledge-single_choice_en',
+        'mathbench-t-en-average',
+    ],
+    summary_groups=mathbench_2024_summary_groups,
+)
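
A note on the few-shot slicing in mathbench_2024_gen_fc2a24.py: each entry of few_shot_prompts is a flat list of HUMAN/BOT messages in which every shot contributes one (HUMAN, BOT) pair and the last two messages form the actual query round, so p[-2 * num_shot - 2:] keeps exactly num_shot examples plus the query. The PPL branch relies on the same layout: the final BOT message carries an {answer} placeholder that .format(answer=answer) fills once per option. Below is a minimal sketch of the slicing, with a hypothetical two-shot prompt list standing in for the real entries from mathbench_prompt:

# Hypothetical stand-in for few_shot_prompts from .mathbench_prompt; the real
# lists are assumed to follow the same (HUMAN, BOT) pairing convention.
few_shot_prompts = {
    'single_choice_en': [
        dict(role='HUMAN', prompt='Question: 1 + 1 = ?'),   # shot 1
        dict(role='BOT', prompt='The answer is A.'),
        dict(role='HUMAN', prompt='Question: 2 + 3 = ?'),   # shot 2
        dict(role='BOT', prompt='The answer is C.'),
        dict(role='HUMAN', prompt='Question: {question}'),  # actual query round
        dict(role='BOT', prompt='{answer}'),
    ],
}

num_shot = 1
# Same slice as in the config: the last 2 * num_shot + 2 messages survive,
# i.e. num_shot example pairs plus the final query round.
prompts = {name: p[-2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
assert len(prompts['single_choice_en']) == 4
assert prompts['single_choice_en'][0]['prompt'] == 'Question: 2 + 3 = ?'

With the configs in place, the evaluation is typically launched through the standard OpenCompass entry point, e.g. python run.py configs/eval_mathbench.py.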