Mathbench update postprocess (#600)

* Update mathbench

* Update mathbench
liushz 2023-11-20 16:48:55 +08:00 committed by GitHub
parent 5e75e29711
commit c9c5c5d92e
5 changed files with 41 additions and 21 deletions

View File

@@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mathbench_gen_10da90 import mathbench_datasets # noqa: F401, F403
from .mathbench_gen_ad37c1 import mathbench_datasets # noqa: F401, F403

View File

@@ -3,17 +3,17 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_capital_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理并得到最终的答案选项。回答格式为如下:\n答案选项A、B、C、D中你认为正确的一个选项\n计算过程:根据题目得到选项答案的一步步过程\n请严格按照上面的格式回答问题,下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你给出正确的答案选项\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please provide the final answer option by step-by-step reasoning. Please answer in the following format:\nAnswer option: A, B, C, or D (the option you believe is correct)\nCalculation process: Step-by-step process to derive the answer option based on the question\nPlease strictly follow the above format to answer the question. Here is the question you need to answer:\n{question}\nAnswer option:",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
}
cloze_prompts={
cloze_prompts = {
"cloze_cn": [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
@@ -53,15 +53,13 @@ cloze_prompts={
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
],
}
]}
mathbench_sets = {
'college': ['single_choice_cn', 'cloze_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn'],
'primary': ['cloze_cn'],
'primary': ['cloze_cn']
}
# Generate reasoning path if set True or just generate the final answer
@@ -75,10 +73,9 @@ mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_infer_cfg = dict(
ice_template=dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin="</E>",
round=[
dict(
role="HUMAN",
@@ -86,15 +83,14 @@ for _split in list(mathbench_sets.keys()):
),
dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
),
ice_token="</E>",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512,),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mathbench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if 'choice' in _name else AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess) if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
mathbench_datasets.append(
dict(
@@ -110,5 +106,3 @@ for _split in list(mathbench_sets.keys()):
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
))
del _split, _name
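
Note on the evaluation change in this file: single-choice predictions are now routed through first_option_postprocess with options='ABCD' instead of first_capital_postprocess, so the extracted answer is the first recognizable option letter in the reply rather than the first capital letter anywhere in it. A minimal sketch of the intended behaviour, with illustrative outputs inferred from the patterns added in opencompass/utils/text_postprocessors.py further down in this diff:

from opencompass.utils.text_postprocessors import first_option_postprocess

# Illustrative calls; the expected extractions assume the patterns added in this PR.
first_option_postprocess('The correct answer option is C, because ...', 'ABCD')  # expected: 'C'
first_option_postprocess('答案选项为B,理由如下:...', 'ABCD')                      # expected: 'B'
first_option_postprocess('选项D正确。', 'ABCD')                                   # expected: 'D'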

View File

@@ -0,0 +1,18 @@
summarizer = dict(
dataset_abbrs=[
'######## MathBench Accuracy ########', # category
['mathbench-college-single_choice_cn', 'acc_1'],
['mathbench-college-cloze_en', 'accuracy'],
['mathbench-high-single_choice_cn', 'acc_1'],
['mathbench-high-single_choice_en', 'acc_1'],
['mathbench-middle-single_choice_cn', 'acc_1'],
['mathbench-primary-cloze_cn', 'accuracy'],
'######## MathBench CircularEval ########', # category
['mathbench-college-single_choice_cn', 'perf_4'],
['mathbench-high-single_choice_cn', 'perf_4'],
['mathbench-high-single_choice_en', 'perf_4'],
['mathbench-middle-single_choice_cn', 'perf_4'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
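
For readers of the new summarizer: as I read CircularEvaluator's outputs, 'acc_1' is accuracy with the options in their original order, 'perf_4' only credits a question when all four circular rotations of its options are answered correctly, and 'accuracy' is the plain AccEvaluator metric for the cloze subsets. A small illustrative sketch of that scoring rule (not CircularEvaluator's actual implementation):

from typing import Dict, List

def circular_scores(correct_by_rotation: Dict[str, List[bool]]) -> Dict[str, float]:
    # Maps each question id to four booleans, one per option rotation,
    # with index 0 being the original ABCD order.
    n = len(correct_by_rotation)
    acc_1 = 100.0 * sum(flags[0] for flags in correct_by_rotation.values()) / n
    perf_4 = 100.0 * sum(all(flags) for flags in correct_by_rotation.values()) / n
    return {'acc_1': acc_1, 'perf_4': perf_4}

circular_scores({'q1': [True, True, True, True], 'q2': [True, False, True, True]})
# -> {'acc_1': 100.0, 'perf_4': 50.0}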

View File

@@ -71,10 +71,15 @@ class MathBenchDataset(BaseDataset):
else:
question = entry['question'].strip(
) + '\n' + get_number(entry['options'])
data.append({
info = {
'question': question,
'answer': entry['answer'].strip()
})
}
# For PPL evaluation
for i in range(4):
info[chr(ord('A') +
i)] = entry['options'][i].strip()
data.append(info)
dataset = Dataset.from_list(data)
return dataset
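
The "# For PPL evaluation" block above stores each option's text under the keys A/B/C/D. No PPL config is added in this PR, but those fields would allow one in the usual opencompass style, roughly like the hypothetical sketch below:

# Hypothetical PPL-style config enabled by the new A/B/C/D fields (not part of this PR).
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer

mathbench_ppl_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            answer: dict(round=[
                dict(role='HUMAN', prompt='{question}\n答案:'),
                # Score the text of this candidate option; {A}..{D} are the new fields.
                dict(role='BOT', prompt=f'{{{answer}}}'),
            ])
            for answer in ['A', 'B', 'C', 'D']
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)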
@@ -91,7 +96,7 @@ def mathbench_postprocess(text: str, name: str) -> str:
ans = ans_line[1].strip()
output = re.sub(r'(\d),(\d)', r'\1\2', ans)
numbers = re.findall(r'-?\d*\.?\d+|\d+', output)
numbers = re.findall(r'-?\d*\.?/?\d+|\d+', output)
if numbers:
return numbers[-1]
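
The widened pattern in mathbench_postprocess (a '/?' added before the final '\d+') appears intended to keep simple fractions such as 3/4 as a single answer token instead of splitting them; a quick check under that reading:

import re

text = 'So the answer is 3/4'
old = re.findall(r'-?\d*\.?\d+|\d+', text)    # ['3', '4'] -> last number is '4'
new = re.findall(r'-?\d*\.?/?\d+|\d+', text)  # ['3/4']    -> last number is '3/4'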

View File

@@ -53,9 +53,12 @@ def first_option_postprocess(text: str, options: str) -> str:
patterns = [
f'[Tt]he answer is [{options}]',
f'[Tt]he correct answer is [{options}]',
f'答案是(.*?)[{options}]',
f'答案为(.*?)[{options}]',
f'[Tt]he correct answer\s?(?:option)?\s?is [{options}]', # noqa
f'答案(?:选项)?是(.*?)[{options}]',
f'答案(?:选项)?为(.*?)[{options}]',
f'答案(?:选项)?选(.*?)[{options}]',
f'选项[{options}]是?正确',
f'选项[{options}]为?正确',
f'固选(.*?)[{options}]',
f'答案应该是(.*?)[{options}]',
f'(\s|^)[{options}][\s。,\.$]', # noqa