mirror of https://github.com/open-compass/opencompass.git
Update MathBench summarizer & fix cot setting (#1282)
* Update MathBench
* Update MathBench
* Update MathBench

---------

Co-authored-by: liushz <liuhongwei@pjlab.rog.cn>
parent a32f21a356
commit fc2c9dea8c
@@ -69,7 +69,7 @@ for _split in mathbench_sets:
 
         mathbench_datasets.append(
             dict(
-                abbr='mathbench-' + _split + '-' + _name,
+                abbr='mathbench-no_cot-' + _split + '-' + _name,
                 type=MathBenchDataset,
                 path=f'data/mathbench_v1/{_split}',
                 name=_name,
configs/datasets/MathBench/mathbench_2024_gen_fc2a24.py (new file, 81 lines)
@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets

# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False

assert 0 <= num_shot <= 4
if num_shot == 0:
    prompts = zero_shot_prompts
else:
    prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}

mathbench_datasets = []
for _split in mathbench_sets:
    for _name in mathbench_sets[_split]:
        if 'single_choice' in _name:
            if with_reasoning:
                template_round = prompts[_name + '_with_reasoning']
            else:
                template_round = prompts[_name]
        else:
            template_round = prompts[_name]

        if 'single_choice' in _name:
            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
        else:
            pred_postprocessor = dict(type=mathbench_postprocess, name=_name)

        if 'single_choice' in _name and with_circular_eval:
            evaluator = dict(type=CircularEvaluator)
        else:
            evaluator = dict(type=AccEvaluator)

        # assemble the final config
        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
            template = {}
            for answer in ['A', 'B', 'C', 'D']:
                one_template_round = deepcopy(template_round)
                one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
                template[answer] = dict(round=one_template_round)
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=template),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=PPLInferencer),
            )
        else:
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer, max_out_len=2048),
            )
        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)

        mathbench_datasets.append(
            dict(
                abbr='mathbench-' + _split + '-' + _name,
                type=MathBenchDataset,
                path=f'data/mathbench_v1/{_split}',
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=mathbench_reader_cfg,
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            )
        )
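The num_shot truncation above, p[- 2 * num_shot - 2:], assumes each few-shot prompt in mathbench_prompt.py is a flat list of alternating HUMAN/BOT turns whose final two entries form the question turn itself, so keeping the last 2 * num_shot + 2 items keeps num_shot worked examples plus the question. A minimal sketch of that slicing with a mock prompt list (not the real prompts):

# Minimal sketch of the few-shot truncation, using a mock prompt list.
# Assumption: each few-shot prompt is a flat list of alternating HUMAN/BOT
# turns, ending with the HUMAN question and a BOT answer slot.
mock_prompt = [
    dict(role='HUMAN', prompt='Q1: 1 + 1 = ?'), dict(role='BOT', prompt='A1: 2'),
    dict(role='HUMAN', prompt='Q2: 2 + 2 = ?'), dict(role='BOT', prompt='A2: 4'),
    dict(role='HUMAN', prompt='Q3: 3 + 3 = ?'), dict(role='BOT', prompt='A3: 6'),
    dict(role='HUMAN', prompt='Q4: 4 + 4 = ?'), dict(role='BOT', prompt='A4: 8'),
    dict(role='HUMAN', prompt='{question}'),    dict(role='BOT', prompt='{answer}'),
]

num_shot = 2
truncated = mock_prompt[-2 * num_shot - 2:]  # keep 2 examples plus the question turn
assert len(truncated) == 6
assert truncated[0]['prompt'].startswith('Q3')  # the oldest examples are dropped first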
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .mathbench_2024_gen_1dc21d import mathbench_datasets  # noqa: F401, F403
+    from .mathbench_2024_gen_fc2a24 import mathbench_datasets  # noqa: F401, F403
configs/eval_mathbench.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from mmengine.config import read_base

with read_base():

    # Import models
    from .models.hf_llama.hf_llama3_8b_instruct import models as llama3_8b_instruct_model
    from .models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_chat_7b_model

    # Import datasets
    from .datasets.MathBench.mathbench_gen import mathbench_datasets

    # Import summarizers for display results
    from .summarizers.groups.mathbench_v1_2024 import summarizer  # Grouped results for MathBench-A and MathBench-T separately
    # from .summarizers.mathbench_v1 import summarizer  # Detailed results for every sub-dataset
    # from .summarizers.groups.mathbench_v1_2024_lang import summarizer  # Grouped results for bilingual results

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

from opencompass.runners import LocalRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask)
    ),
)

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)
    ),
)

work_dir = './outputs/mathbench_results'
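A note on how this config is consumed: datasets and models are gathered from every module-level list whose name ends with _datasets or _model, so renaming an imported alias silently drops it from the run; the config is then typically launched with the standard entry point, python run.py configs/eval_mathbench.py. A small sketch of the collection pattern with mock entries (not the real model configs):

# Illustration of the locals() collection pattern used above (mock entries only).
# Any module-level list named `*_model` or `*_datasets` is picked up; a typo in
# the alias (e.g. `llama3_8b_instruct_models`) would drop it from `models`.
llama3_8b_instruct_model = [dict(abbr='llama-3-8b-instruct-hf')]         # mock
internlm2_chat_7b_model = [dict(abbr='internlm2-chat-7b-hf')]            # mock
mathbench_datasets = [dict(abbr='mathbench-college-single_choice_en')]   # mock

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

assert [m['abbr'] for m in models] == ['llama-3-8b-instruct-hf', 'internlm2-chat-7b-hf']
assert len(datasets) == 1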
configs/summarizers/groups/mathbench_v1_2024.py (new file, 44 lines)
@@ -0,0 +1,44 @@
mathbench_2024_summary_groups = [
    {'name': 'college', 'subsets': [['mathbench-college-single_choice_cn', 'perf_4'], ['mathbench-college-single_choice_en', 'perf_4']]},
    {'name': 'high', 'subsets': [['mathbench-high-single_choice_cn', 'perf_4'], ['mathbench-high-single_choice_en', 'perf_4']]},
    {'name': 'middle', 'subsets': [['mathbench-middle-single_choice_cn', 'perf_4'], ['mathbench-middle-single_choice_en', 'perf_4']]},
    {'name': 'primary', 'subsets': [['mathbench-primary-cloze_cn', 'accuracy'], ['mathbench-primary-cloze_en', 'accuracy']]},
    {'name': 'arithmetic', 'subsets': [['mathbench-arithmetic-cloze_en', 'accuracy']]},
    {'name': 'mathbench-a-cn', 'subsets': ['mathbench-college-single_choice_cn', 'mathbench-high-single_choice_cn', 'mathbench-middle-single_choice_cn', 'mathbench-primary-cloze_cn']},
    {'name': 'mathbench-a-en', 'subsets': ['mathbench-college-single_choice_en', 'mathbench-high-single_choice_en', 'mathbench-middle-single_choice_en', 'mathbench-primary-cloze_en']},
    {'name': 'mathbench-a (average)', 'subsets': ['college', 'high', 'middle', 'primary', 'arithmetic']},

    {'name': 'college_knowledge', 'subsets': [['mathbench-college_knowledge-single_choice_cn', 'perf_4'], ['mathbench-college_knowledge-single_choice_en', 'perf_4']]},
    {'name': 'high_knowledge', 'subsets': [['mathbench-high_knowledge-single_choice_cn', 'perf_4'], ['mathbench-high_knowledge-single_choice_en', 'perf_4']]},
    {'name': 'middle_knowledge', 'subsets': [['mathbench-middle_knowledge-single_choice_cn', 'perf_4'], ['mathbench-middle_knowledge-single_choice_en', 'perf_4']]},
    {'name': 'primary_knowledge', 'subsets': [['mathbench-primary_knowledge-single_choice_cn', 'perf_4'], ['mathbench-primary_knowledge-single_choice_en', 'perf_4']]},
    {'name': 'mathbench-t-cn', 'subsets': ['mathbench-college_knowledge-single_choice_cn', 'mathbench-high_knowledge-single_choice_cn', 'mathbench-middle_knowledge-single_choice_cn', 'mathbench-primary_knowledge-single_choice_cn']},
    {'name': 'mathbench-t-en', 'subsets': ['mathbench-college_knowledge-single_choice_en', 'mathbench-high_knowledge-single_choice_en', 'mathbench-middle_knowledge-single_choice_en', 'mathbench-primary_knowledge-single_choice_en']},
    {'name': 'mathbench-t (average)', 'subsets': ['college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge']},

    {'name': 'Overall', 'subsets': ['mathbench-a (average)', 'mathbench-t (average)']},
]

summarizer = dict(
    dataset_abbrs = [
        '###### MathBench-A: Application Part ######',
        'college',
        'high',
        'middle',
        'primary',
        'arithmetic',
        'mathbench-a (average)',

        '###### MathBench-T: Theory Part ######',
        'college_knowledge',
        'high_knowledge',
        'middle_knowledge',
        'primary_knowledge',
        'mathbench-t (average)',

        '###### Overall: Average between MathBench-A and MathBench-T ######',
        'Overall',
    ],
    summary_groups=mathbench_2024_summary_groups,
)
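For orientation, each summary group above is reported as an average over the listed subsets, assuming OpenCompass's default equal-weight aggregation: 'college' averages the perf_4 (CircularEval) scores of the CN and EN single-choice splits, and 'mathbench-a (average)' then averages the five application groups. A rough sketch of that roll-up with mock scores and a hypothetical roll_up helper (not part of OpenCompass):

# Rough sketch of the grouping logic, assuming equal-weight averaging.
# Mock scores only; the `roll_up` helper is a hypothetical illustration.
mock_scores = {
    ('mathbench-college-single_choice_cn', 'perf_4'): 48.0,
    ('mathbench-college-single_choice_en', 'perf_4'): 56.0,
}

def roll_up(group, scores):
    """Average the requested metric of every subset in a summary group."""
    values = [scores[(abbr, metric)] for abbr, metric in group['subsets']]
    return sum(values) / len(values)

college = {'name': 'college', 'subsets': [
    ['mathbench-college-single_choice_cn', 'perf_4'],
    ['mathbench-college-single_choice_en', 'perf_4'],
]}
print(roll_up(college, mock_scores))  # 52.0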
configs/summarizers/groups/mathbench_v1_2024_lang.py (new file, 57 lines)
@@ -0,0 +1,57 @@
mathbench_2024_summary_groups = [
    {'name': 'college', 'subsets': [['mathbench-college-single_choice_cn', 'perf_4'], ['mathbench-college-single_choice_en', 'perf_4']]},
    {'name': 'high', 'subsets': [['mathbench-high-single_choice_cn', 'perf_4'], ['mathbench-high-single_choice_en', 'perf_4']]},
    {'name': 'middle', 'subsets': [['mathbench-middle-single_choice_cn', 'perf_4'], ['mathbench-middle-single_choice_en', 'perf_4']]},
    {'name': 'primary', 'subsets': [['mathbench-primary-cloze_cn', 'accuracy'], ['mathbench-primary-cloze_en', 'accuracy']]},
    {'name': 'arithmetic', 'subsets': [['mathbench-arithmetic-cloze_en', 'accuracy']]},
    {'name': 'mathbench-a-cn-average', 'subsets': ['mathbench-college-single_choice_cn', 'mathbench-high-single_choice_cn', 'mathbench-middle-single_choice_cn', 'mathbench-primary-cloze_cn']},
    {'name': 'mathbench-a-en-average', 'subsets': ['mathbench-college-single_choice_en', 'mathbench-high-single_choice_en', 'mathbench-middle-single_choice_en', 'mathbench-primary-cloze_en']},
    {'name': 'mathbench-a (average)', 'subsets': ['college', 'high', 'middle', 'primary', 'arithmetic']},

    {'name': 'college_knowledge', 'subsets': [['mathbench-college_knowledge-single_choice_cn', 'perf_4'], ['mathbench-college_knowledge-single_choice_en', 'perf_4']]},
    {'name': 'high_knowledge', 'subsets': [['mathbench-high_knowledge-single_choice_cn', 'perf_4'], ['mathbench-high_knowledge-single_choice_en', 'perf_4']]},
    {'name': 'middle_knowledge', 'subsets': [['mathbench-middle_knowledge-single_choice_cn', 'perf_4'], ['mathbench-middle_knowledge-single_choice_en', 'perf_4']]},
    {'name': 'primary_knowledge', 'subsets': [['mathbench-primary_knowledge-single_choice_cn', 'perf_4'], ['mathbench-primary_knowledge-single_choice_en', 'perf_4']]},
    {'name': 'mathbench-t-cn-average', 'subsets': ['mathbench-college_knowledge-single_choice_cn', 'mathbench-high_knowledge-single_choice_cn', 'mathbench-middle_knowledge-single_choice_cn', 'mathbench-primary_knowledge-single_choice_cn']},
    {'name': 'mathbench-t-en-average', 'subsets': ['mathbench-college_knowledge-single_choice_en', 'mathbench-high_knowledge-single_choice_en', 'mathbench-middle_knowledge-single_choice_en', 'mathbench-primary_knowledge-single_choice_en']},
    {'name': 'mathbench-t (average)', 'subsets': ['college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge']},

    {'name': 'Overall', 'subsets': ['mathbench-a (average)', 'mathbench-t (average)']},
]


summarizer = dict(
    dataset_abbrs = [
        '########################################################',
        '###### MathBench-A-CN: Application Part (Chinese) ######',
        'mathbench-college-single_choice_cn',
        'mathbench-high-single_choice_cn',
        'mathbench-middle-single_choice_cn',
        'mathbench-primary-cloze_cn',
        'mathbench-a-cn-average',

        '###### MathBench-A-EN: Application Part (English) ######',
        'mathbench-college-single_choice_en',
        'mathbench-high-single_choice_en',
        'mathbench-middle-single_choice_en',
        'mathbench-primary-cloze_en',
        'mathbench-a-en-average',

        '###################################################',
        '###### MathBench-T-CN: Theory Part (Chinese) ######',
        'mathbench-college_knowledge-single_choice_cn',
        'mathbench-high_knowledge-single_choice_cn',
        'mathbench-middle_knowledge-single_choice_cn',
        'mathbench-primary_knowledge-single_choice_cn',
        'mathbench-t-cn-average',

        '###### MathBench-T-EN: Theory Part (English) ######',
        'mathbench-college_knowledge-single_choice_en',
        'mathbench-high_knowledge-single_choice_en',
        'mathbench-middle_knowledge-single_choice_en',
        'mathbench-primary_knowledge-single_choice_en',
        'mathbench-t-en-average',
    ],
    summary_groups=mathbench_2024_summary_groups,
)