From bc2969dba8b97e8caef54c5ae98d02a4af2f17b5 Mon Sep 17 00:00:00 2001
From: Yufeng Zhao <115388472+epsilondylan@users.noreply.github.com>
Date: Wed, 12 Mar 2025 10:53:31 +0800
Subject: [PATCH] [Feature] Add support for BBEH dataset (#1925)

* bbeh

* bbeh

* fix_smallbugs_bbeh

* removeprint

* results

---------

Co-authored-by: yufeng zhao
---
 dataset-index.yml                             |   5 +
 opencompass/configs/datasets/bbeh/README.md   |  26 +++
 opencompass/configs/datasets/bbeh/bbeh_gen.py |  93 +++++++++++
 .../configs/summarizers/groups/bbeh.py        |  12 ++
 opencompass/datasets/__init__.py              |   1 +
 opencompass/datasets/bbeh.py                  | 149 ++++++++++++++++++
 opencompass/utils/datasets_info.py            |  10 ++
 7 files changed, 296 insertions(+)
 create mode 100644 opencompass/configs/datasets/bbeh/README.md
 create mode 100644 opencompass/configs/datasets/bbeh/bbeh_gen.py
 create mode 100644 opencompass/configs/summarizers/groups/bbeh.py
 create mode 100644 opencompass/datasets/bbeh.py

diff --git a/dataset-index.yml b/dataset-index.yml
index f72e7362..e998f65f 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -234,6 +234,11 @@
     category: Reasoning
     paper: https://arxiv.org/pdf/2210.09261
     configpath: opencompass/configs/datasets/bbh
+- bbeh:
+    name: BIG-Bench Extra Hard
+    category: Reasoning
+    paper: https://arxiv.org/abs/2502.19187
+    configpath: opencompass/configs/datasets/bbeh
 - BoolQ:
     name: SuperGLUE / BoolQ
     category: Knowledge
diff --git a/opencompass/configs/datasets/bbeh/README.md b/opencompass/configs/datasets/bbeh/README.md
new file mode 100644
index 00000000..1fd034ff
--- /dev/null
+++ b/opencompass/configs/datasets/bbeh/README.md
@@ -0,0 +1,26 @@
+# BBEH
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
+python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
+```
+
+## Models
+
+| model | score |
+|:-----------------------------------------:|------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |
+
+### Details
+
+| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa |
+|:-----------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|-----:|-----------------:|-------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 |
+
+| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
+|:-----------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 |
+
+| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
+|:-----------------------------------------:|------------------:|-------:|-----------------:|----------------:|------------:|-------------:|--------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 |
diff --git a/opencompass/configs/datasets/bbeh/bbeh_gen.py b/opencompass/configs/datasets/bbeh/bbeh_gen.py
new file mode 100644
index 00000000..522ade24
--- /dev/null
+++ b/opencompass/configs/datasets/bbeh/bbeh_gen.py
@@ -0,0 +1,93 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BBEHDataset, BBEHEvaluator, bbeh_mcq_postprocess, BBEHEvaluator_mcq
+
+bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+
+bbeh_multiple_choice_sets = [
+    'bbeh_boolean_expressions',
+    'bbeh_disambiguation_qa',
+    'bbeh_geometric_shapes',
+    'bbeh_hyperbaton',
+    'bbeh_movie_recommendation',
+    'bbeh_nycc',
+    'bbeh_shuffled_objects',
+]
+
+bbeh_free_form_sets = [
+    'bbeh_boardgame_qa',
+    'bbeh_buggy_tables',
+    'bbeh_causal_understanding',
+    'bbeh_dyck_languages',
+    'bbeh_linguini',
+    'bbeh_multistep_arithmetic',
+    'bbeh_object_counting',
+    'bbeh_object_properties',
+    'bbeh_sarc_triples',
+    'bbeh_spatial_reasoning',
+    'bbeh_sportqa',
+    'bbeh_temporal_sequence',
+    'bbeh_time_arithmetic',
+    'bbeh_web_of_lies',
+    'bbeh_word_sorting',
+    'bbeh_zebra_puzzles',
+]
+
+bbeh_datasets = []
+for _name in bbeh_multiple_choice_sets:
+    bbeh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\" without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=8192))
+    bbeh_eval_cfg = dict(
+        evaluator=dict(type=BBEHEvaluator_mcq),
+        pred_role='BOT',
+        pred_postprocessor=dict(type=bbeh_mcq_postprocess),
+        dataset_postprocessor=dict(type=bbeh_mcq_postprocess))
+
+    bbeh_datasets.append(
+        dict(
+            type=BBEHDataset,
+            path='opencompass/bbeh',
+            name=_name,
+            abbr=_name,
+            reader_cfg=bbeh_reader_cfg,
+            infer_cfg=bbeh_infer_cfg.copy(),
+            eval_cfg=bbeh_eval_cfg.copy()))
+
+for _name in bbeh_free_form_sets:
+    bbeh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\" without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=8192))
+    bbeh_eval_cfg = dict(evaluator=dict(type=BBEHEvaluator), pred_role='BOT', pred_postprocessor=dict(type=bbeh_mcq_postprocess), dataset_postprocessor=dict(type=bbeh_mcq_postprocess))
+
+    bbeh_datasets.append(
+        dict(
+            type=BBEHDataset,
+            path='opencompass/bbeh',
+            name=_name,
+            abbr=_name,
+            reader_cfg=bbeh_reader_cfg,
+            infer_cfg=bbeh_infer_cfg.copy(),
+            eval_cfg=bbeh_eval_cfg.copy()))
\ No newline at end of file
diff --git a/opencompass/configs/summarizers/groups/bbeh.py b/opencompass/configs/summarizers/groups/bbeh.py
new file mode 100644
index 00000000..5e5cc222
--- /dev/null
+++ b/opencompass/configs/summarizers/groups/bbeh.py
@@ -0,0 +1,12 @@
+bbeh_summary_groups = []
+
+# bbeh
+_bbeh = [
+    'bbeh_boolean_expressions', 'bbeh_disambiguation_qa', 'bbeh_geometric_shapes', 'bbeh_hyperbaton',
+    'bbeh_movie_recommendation', 'bbeh_nycc', 'bbeh_shuffled_objects', 'bbeh_boardgame_qa',
+    'bbeh_buggy_tables', 'bbeh_causal_understanding', 'bbeh_dyck_languages', 'bbeh_linguini',
+    'bbeh_multistep_arithmetic', 'bbeh_object_counting', 'bbeh_object_properties', 'bbeh_sarc_triples',
+    'bbeh_spatial_reasoning', 'bbeh_sportqa', 'bbeh_temporal_sequence', 'bbeh_time_arithmetic',
+    'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles'
+]
+bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh})
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index ffcc217d..6d135f61 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -9,6 +9,7 @@ from .arc import *  # noqa: F401, F403
 from .arc_prize_public_evaluation import *  # noqa: F401, F403
 from .ax import *  # noqa: F401, F403
 from .babilong import *  # noqa: F401, F403
+from .bbeh import *  # noqa: F401, F403
 from .bbh import *  # noqa: F401, F403
 from .bigcodebench import *  # noqa: F401, F403
 from .boolq import *  # noqa: F401, F403
diff --git a/opencompass/datasets/bbeh.py b/opencompass/datasets/bbeh.py
new file mode 100644
index 00000000..0b3a49a7
--- /dev/null
+++ b/opencompass/datasets/bbeh.py
@@ -0,0 +1,149 @@
+import json
+import os.path as osp
+import re
+from os import environ
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
+                                  TEXT_POSTPROCESSORS)
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class BBEHDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, name: str):
+        path = get_data_path(path)
+        if environ.get('DATASET_SOURCE') == 'ModelScope':
+            from modelscope import MsDataset
+            dataset = MsDataset.load(path, subset_name=name, split='test')
+        else:
+            with open(osp.join(path, f'{name}/task.json'), 'r') as f:
+                data = json.load(f)['examples']
+            dataset = Dataset.from_list(data)
+        return dataset
+
+
+@TEXT_POSTPROCESSORS.register_module('bbeh_freeform')
+def bbeh_freeform_postprocess(text: str) -> str:
+    # Extract answer using specified prefixes
+    prefixes = [
+        'The answer is: ', 'The answer is ', 'The final answer is: ',
+        'The final answer is '
+    ]
+    answer = text
+    for prefix in prefixes:
+        if prefix in text:
+            answer = text.split(prefix)[-1]
+            break
+
+    # Remove formatting markup
+    if '\\boxed' in answer:
+        answer = re.sub(r'\\boxed{(.*?)}', r'\1', answer)  # latex box
+    if '\\text' in answer:
+        answer = re.sub(r'\\text(?:tt)?{(.*?)}', r'\1', answer)  # text/texttt
+    if '**' in answer:
+        answer = re.sub(r'\*\*(.*?)\*\*', r'\1', answer)  # bold
+
+    # Take first line and clean
+    if '\n' in answer:
+        answer = answer.split('\n')[0].strip()
+
+    return answer.strip().lower()
+
+
+@TEXT_POSTPROCESSORS.register_module('bbeh_mcq')
+def bbeh_mcq_postprocess(text: str) -> str:
+    # Extract answer using specified prefixes
+    prefixes = [
+        'The answer is: ', 'The answer is ', 'The final answer is: ',
+        'The final answer is '
+    ]
+    answer = text
+    for prefix in prefixes:
+        if prefix in text:
+            answer = text.split(prefix)[-1]
+            break
+
+    # Remove parentheses if present
+    answer = answer.strip('()')
+
+    # Take first line and clean
+    if '\n' in answer:
+        answer = answer.split('\n')[0].strip()
+
+    return answer.strip().lower()
+
+
+@ICL_EVALUATORS.register_module()
+class BBEHEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+
+        processed_preds = [bbeh_freeform_postprocess(p) for p in predictions]
+        # References are already in correct format
+        processed_refs = [r.lower() for r in references]
+
+        details = []
+        correct_count = 0
+
+        for pred, ref in zip(processed_preds, processed_refs):
+            correct = False
+
+            # Rule 1: Exact match
+            if pred == ref:
+                correct = True
+            # Rule 2: Match after removing quotes/brackets
+            elif pred == ref.strip("'\"()[]"):
+                correct = True
+            # Rule 3: Comma-separated answers
+            elif ',' in ref:
+                norm_pred = re.sub(r'\s*,\s*', ',', pred)
+                norm_ref = re.sub(r'\s*,\s*', ',', ref)
+                if norm_pred == norm_ref:
+                    correct = True
+
+            details.append({'pred': pred, 'answer': ref, 'correct': correct})
+            correct_count += int(correct)
+
+        score = (correct_count / len(predictions)) * 100
+        return {'score': score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class BBEHEvaluator_mcq(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+
+        processed_preds = [bbeh_mcq_postprocess(p) for p in predictions]
+        # References are already in correct format
+        processed_refs = [r.lower().strip('()') for r in references]
+
+        details = []
+        correct_count = 0
+
+        for pred, ref in zip(processed_preds, processed_refs):
+            correct = False
+
+            # Rule 1: Exact match
+            if pred == ref:
+                correct = True
+
+            details.append({'pred': pred, 'answer': ref, 'correct': correct})
+            correct_count += int(correct)
+
+        score = (correct_count / len(predictions)) * 100
+        return {'score': score, 'details': details}
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index 79be5736..25c877c6 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -33,6 +33,12 @@
         "hf_id": "opencompass/bbh",
         "local": "./data/BBH/data",
     },
+    # bbeh
+    "opencompass/bbeh": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/bbeh/",
+    },
     # C-Eval
     "opencompass/ceval-exam": {
         "ms_id": "opencompass/ceval-exam",
@@ -691,6 +697,10 @@
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
         "md5": "9107597d137e7362eaf7d218ddef7a6d",
     },
+    "/bbeh": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bbeh.zip",
+        "md5": "43a3c2d73aee731ac68ac790bc9a358e",
+    },
     "subjective/judgerbench": {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip",