mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Add support for BBEH dataset (#1925)
* bbeh * bbeh * fix_smallbugs_bbeh * removeprint * results --------- Co-authored-by: yufeng zhao <zhaoyufeng@pjlab.org.cn>
This commit is contained in:
parent
59e49aedf1
commit
bc2969dba8
@ -234,6 +234,11 @@
|
|||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2210.09261
|
paper: https://arxiv.org/pdf/2210.09261
|
||||||
configpath: opencompass/configs/datasets/bbh
|
configpath: opencompass/configs/datasets/bbh
|
||||||
|
- bbeh:
|
||||||
|
name: BIG-Bench Extra Hard
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/abs/2502.19187
|
||||||
|
configpath: opencompass/configs/datasets/bbeh
|
||||||
- BoolQ:
|
- BoolQ:
|
||||||
name: SuperGLUE / BoolQ
|
name: SuperGLUE / BoolQ
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
|
26
opencompass/configs/datasets/bbeh/README.md
Normal file
26
opencompass/configs/datasets/bbeh/README.md
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# BBEH
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
|
||||||
|
python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
|
||||||
|
```
|
||||||
|
|
||||||
|
## Models
|
||||||
|
|
||||||
|
| model | score |
|
||||||
|
|:-----------------------------------------:|------:|
|
||||||
|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |
|
||||||
|
|
||||||
|
### Details
|
||||||
|
|
||||||
|
| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa |
|
||||||
|
|:-----------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|-----:|-----------------:|-------------:|
|
||||||
|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 |
|
||||||
|
|
||||||
|
| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
|
||||||
|
|:-----------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:|
|
||||||
|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 |
|
||||||
|
|
||||||
|
| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
|
||||||
|
|:-----------------------------------------:|------------------:|-------:|-----------------:|----------------:|------------:|-------------:|--------------:|
|
||||||
|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 |
|
93
opencompass/configs/datasets/bbeh/bbeh_gen.py
Normal file
93
opencompass/configs/datasets/bbeh/bbeh_gen.py
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BBEHDataset, BBEHEvaluator,
                                  BBEHEvaluator_mcq, bbeh_freeform_postprocess,
                                  bbeh_mcq_postprocess)

# Every BBEH subset exposes a single 'input' column and a 'target' answer.
bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Zero-shot chain-of-thought prompt shared by all subsets. `{input}` is the
# PromptTemplate placeholder filled with the question text at inference time.
# NOTE(review): the missing space in '"The answer is:"without' is kept as-is —
# changing the prompt would change model outputs and reported scores.
_BBEH_PROMPT = (
    'Think step by step, and when you provide the final answer, please use '
    'the prefix "The answer is:"without any modification, and provide the '
    'answer directly, with no formatting, no bolding, and no markup. For '
    'instance: "The answer is: 42" or "The answer is: yes". If the question '
    'is multiple choice with a single correct answer, the final answer must '
    'only be the letter corresponding to the correct answer. For example, '
    '"The answer is: (a)"\n\nQ: {input}\nA: ')

# Subsets whose answers are option letters like '(a)'.
bbeh_multiple_choice_sets = [
    'bbeh_boolean_expressions',
    'bbeh_disambiguation_qa',
    'bbeh_geometric_shapes',
    'bbeh_hyperbaton',
    'bbeh_movie_recommendation',
    'bbeh_nycc',
    'bbeh_shuffled_objects',
]

# Subsets whose answers are free-form strings (numbers, words, lists, ...).
bbeh_free_form_sets = [
    'bbeh_boardgame_qa',
    'bbeh_buggy_tables',
    'bbeh_causal_understanding',
    'bbeh_dyck_languages',
    'bbeh_linguini',
    'bbeh_multistep_arithmetic',
    'bbeh_object_counting',
    'bbeh_object_properties',
    'bbeh_sarc_triples',
    'bbeh_spatial_reasoning',
    'bbeh_sportqa',
    'bbeh_temporal_sequence',
    'bbeh_time_arithmetic',
    'bbeh_web_of_lies',
    'bbeh_word_sorting',
    'bbeh_zebra_puzzles',
]

bbeh_datasets = []

for _name in bbeh_multiple_choice_sets:
    bbeh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=_BBEH_PROMPT),
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))
    # Multiple-choice subsets: strip parentheses from both prediction and
    # reference so '(a)' and 'a' compare equal.
    bbeh_eval_cfg = dict(
        evaluator=dict(type=BBEHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbeh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbeh_mcq_postprocess))

    bbeh_datasets.append(
        dict(
            type=BBEHDataset,
            path='opencompass/bbeh',
            name=_name,
            abbr=_name,
            reader_cfg=bbeh_reader_cfg,
            infer_cfg=bbeh_infer_cfg.copy(),
            eval_cfg=bbeh_eval_cfg.copy()))

for _name in bbeh_free_form_sets:
    bbeh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=_BBEH_PROMPT),
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))
    # FIX: free-form subsets must use the free-form postprocessor. The
    # previous config applied bbeh_mcq_postprocess here, which strips
    # surrounding parentheses — wrong for free-form answers such as '(1, 2)'.
    bbeh_eval_cfg = dict(
        evaluator=dict(type=BBEHEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbeh_freeform_postprocess),
        dataset_postprocessor=dict(type=bbeh_freeform_postprocess))

    bbeh_datasets.append(
        dict(
            type=BBEHDataset,
            path='opencompass/bbeh',
            name=_name,
            abbr=_name,
            reader_cfg=bbeh_reader_cfg,
            infer_cfg=bbeh_infer_cfg.copy(),
            eval_cfg=bbeh_eval_cfg.copy()))
12
opencompass/configs/summarizers/groups/bbeh.py
Normal file
12
opencompass/configs/summarizers/groups/bbeh.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# Summarizer group for BBEH: aggregates the 23 per-subset scores into a
# single 'bbeh' entry in the results table.
_bbeh_subsets = [
    'bbeh_boolean_expressions', 'bbeh_disambiguation_qa',
    'bbeh_geometric_shapes', 'bbeh_hyperbaton', 'bbeh_movie_recommendation',
    'bbeh_nycc', 'bbeh_shuffled_objects', 'bbeh_boardgame_qa',
    'bbeh_buggy_tables', 'bbeh_causal_understanding', 'bbeh_dyck_languages',
    'bbeh_linguini', 'bbeh_multistep_arithmetic', 'bbeh_object_counting',
    'bbeh_object_properties', 'bbeh_sarc_triples', 'bbeh_spatial_reasoning',
    'bbeh_sportqa', 'bbeh_temporal_sequence', 'bbeh_time_arithmetic',
    'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles'
]

bbeh_summary_groups = [{'name': 'bbeh', 'subsets': _bbeh_subsets}]
|
@ -9,6 +9,7 @@ from .arc import * # noqa: F401, F403
|
|||||||
from .arc_prize_public_evaluation import * # noqa: F401, F403
|
from .arc_prize_public_evaluation import * # noqa: F401, F403
|
||||||
from .ax import * # noqa: F401, F403
|
from .ax import * # noqa: F401, F403
|
||||||
from .babilong import * # noqa: F401, F403
|
from .babilong import * # noqa: F401, F403
|
||||||
|
from .bbeh import * # noqa: F401, F403
|
||||||
from .bbh import * # noqa: F401, F403
|
from .bbh import * # noqa: F401, F403
|
||||||
from .bigcodebench import * # noqa: F401, F403
|
from .bigcodebench import * # noqa: F401, F403
|
||||||
from .boolq import * # noqa: F401, F403
|
from .boolq import * # noqa: F401, F403
|
||||||
|
149
opencompass/datasets/bbeh.py
Normal file
149
opencompass/datasets/bbeh.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
import json
|
||||||
|
import os.path as osp
|
||||||
|
import re
|
||||||
|
from os import environ
|
||||||
|
|
||||||
|
from datasets import Dataset
|
||||||
|
|
||||||
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||||
|
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
|
||||||
|
TEXT_POSTPROCESSORS)
|
||||||
|
from opencompass.utils import get_data_path
|
||||||
|
|
||||||
|
from .base import BaseDataset
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
class BBEHDataset(BaseDataset):
    """Loader for BIG-Bench Extra Hard (BBEH) subsets."""

    @staticmethod
    def load(path: str, name: str):
        """Load one BBEH subset.

        Args:
            path: Dataset root (resolved via ``get_data_path``).
            name: Subset name, e.g. ``bbeh_boolean_expressions``; used as the
                subdirectory containing ``task.json``.

        Returns:
            A dataset whose rows are the subset's examples.
        """
        root = get_data_path(path)
        # ModelScope hosting is selected via the DATASET_SOURCE env var.
        if environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            return MsDataset.load(root, subset_name=name, split='test')
        # Local layout: <root>/<name>/task.json with an 'examples' list.
        with open(osp.join(root, f'{name}/task.json'), 'r') as f:
            examples = json.load(f)['examples']
        return Dataset.from_list(examples)
|
||||||
|
|
||||||
|
|
||||||
|
@TEXT_POSTPROCESSORS.register_module('bbeh_freeform')
def bbeh_freeform_postprocess(text: str) -> str:
    """Extract and normalize a free-form BBEH answer.

    Takes the text after the last recognized answer prefix, strips common
    markup (LaTeX \\boxed / \\text, markdown bold), keeps only the first
    line, and lowercases the result.
    """
    # Longer/colon variants come first so they win over their bare prefixes.
    prefixes = (
        'The answer is: ',
        'The answer is ',
        'The final answer is: ',
        'The final answer is ',
    )
    answer = text
    for prefix in prefixes:
        if prefix in text:
            # Use the LAST occurrence: CoT traces may quote the prefix early.
            answer = text.split(prefix)[-1]
            break

    # Markup removal: (needle to test for, pattern unwrapping the content).
    markup = (
        ('\\boxed', r'\\boxed{(.*?)}'),          # LaTeX box
        ('\\text', r'\\text(?:tt)?{(.*?)}'),     # \text / \texttt
        ('**', r'\*\*(.*?)\*\*'),                # markdown bold
    )
    for needle, pattern in markup:
        if needle in answer:
            answer = re.sub(pattern, r'\1', answer)

    # Keep only the first line of whatever follows the prefix.
    if '\n' in answer:
        answer = answer.split('\n')[0].strip()

    return answer.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
@TEXT_POSTPROCESSORS.register_module('bbeh_mcq')
def bbeh_mcq_postprocess(text: str) -> str:
    """Extract a multiple-choice BBEH answer (e.g. ``(a)`` -> ``a``).

    Takes the text after the last recognized answer prefix, keeps only the
    first line, strips surrounding parentheses, and lowercases the result.
    """
    # Longer/colon variants come first so they win over their bare prefixes.
    prefixes = [
        'The answer is: ', 'The answer is ', 'The final answer is: ',
        'The final answer is '
    ]
    answer = text
    for prefix in prefixes:
        if prefix in text:
            # Use the LAST occurrence: CoT traces may quote the prefix early.
            answer = text.split(prefix)[-1]
            break

    # FIX: take the first line BEFORE stripping parentheses. Previously
    # str.strip('()') ran on the whole multi-line tail, so a trailing ')'
    # on the answer line survived whenever explanation lines followed:
    # 'The answer is: (a)\nBecause...' yielded 'a)' instead of 'a'.
    if '\n' in answer:
        answer = answer.split('\n')[0].strip()

    # Remove surrounding parentheses if present: '(a)' -> 'a'.
    answer = answer.strip('()')

    return answer.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
class BBEHEvaluator(BaseEvaluator):
    """Accuracy evaluator for free-form BBEH subsets.

    Predictions are normalized with ``bbeh_freeform_postprocess``; references
    are lowercased. A prediction counts as correct under any of the matching
    rules documented in ``score``.
    """

    def score(self, predictions, references):
        """Return ``{'score': <0-100 accuracy>, 'details': [...]}``.

        Args:
            predictions: Raw model outputs, one per example.
            references: Gold answers, one per example.
        """
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length'
            }
        # FIX: guard the empty split — the final accuracy computation
        # divided by len(predictions) and raised ZeroDivisionError.
        if not predictions:
            return {'score': 0.0, 'details': []}

        processed_preds = [bbeh_freeform_postprocess(p) for p in predictions]
        # References are already plain answers; only case-normalize them.
        processed_refs = [r.lower() for r in references]

        details = []
        correct_count = 0

        for pred, ref in zip(processed_preds, processed_refs):
            correct = False

            # Rule 1: exact match.
            if pred == ref:
                correct = True
            # Rule 2: match after removing quotes/brackets from the reference.
            elif pred == ref.strip("'\"()[]"):
                correct = True
            # Rule 3: comma-separated answers — ignore spacing around commas.
            # (Previously mislabeled 'Rule 4'; there is no other rule.)
            elif ',' in ref:
                norm_pred = re.sub(r'\s*,\s*', ',', pred)
                norm_ref = re.sub(r'\s*,\s*', ',', ref)
                if norm_pred == norm_ref:
                    correct = True

            details.append({'pred': pred, 'answer': ref, 'correct': correct})
            correct_count += int(correct)

        score = (correct_count / len(predictions)) * 100
        return {'score': score, 'details': details}
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
class BBEHEvaluator_mcq(BaseEvaluator):
    """Accuracy evaluator for multiple-choice BBEH subsets.

    Predictions are normalized with ``bbeh_mcq_postprocess``; references are
    lowercased and stripped of surrounding parentheses, so ``(a)`` and ``a``
    compare equal. Only exact match counts as correct.
    """

    def score(self, predictions, references):
        """Return ``{'score': <0-100 accuracy>, 'details': [...]}``.

        Args:
            predictions: Raw model outputs, one per example.
            references: Gold answers, one per example.
        """
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length'
            }
        # FIX: guard the empty split — the final accuracy computation
        # divided by len(predictions) and raised ZeroDivisionError.
        if not predictions:
            return {'score': 0.0, 'details': []}

        processed_preds = [bbeh_mcq_postprocess(p) for p in predictions]
        # Normalize references the same way: lowercase, drop parentheses.
        processed_refs = [r.lower().strip('()') for r in references]

        details = []
        correct_count = 0

        for pred, ref in zip(processed_preds, processed_refs):
            # Exact match only — option letters leave no room for fuzziness.
            correct = pred == ref

            details.append({'pred': pred, 'answer': ref, 'correct': correct})
            correct_count += int(correct)

        score = (correct_count / len(predictions)) * 100
        return {'score': score, 'details': details}
|
@ -33,6 +33,12 @@ DATASETS_MAPPING = {
|
|||||||
"hf_id": "opencompass/bbh",
|
"hf_id": "opencompass/bbh",
|
||||||
"local": "./data/BBH/data",
|
"local": "./data/BBH/data",
|
||||||
},
|
},
|
||||||
|
# bbeh
|
||||||
|
"opencompass/bbeh": {
|
||||||
|
"ms_id": "",
|
||||||
|
"hf_id": "",
|
||||||
|
"local": "./data/bbeh/",
|
||||||
|
},
|
||||||
# C-Eval
|
# C-Eval
|
||||||
"opencompass/ceval-exam": {
|
"opencompass/ceval-exam": {
|
||||||
"ms_id": "opencompass/ceval-exam",
|
"ms_id": "opencompass/ceval-exam",
|
||||||
@ -691,6 +697,10 @@ DATASETS_URL = {
|
|||||||
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
|
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
|
||||||
"md5": "9107597d137e7362eaf7d218ddef7a6d",
|
"md5": "9107597d137e7362eaf7d218ddef7a6d",
|
||||||
},
|
},
|
||||||
|
"/bbeh": {
|
||||||
|
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bbeh.zip",
|
||||||
|
"md5": "43a3c2d73aee731ac68ac790bc9a358e",
|
||||||
|
},
|
||||||
"subjective/judgerbench": {
|
"subjective/judgerbench": {
|
||||||
"url":
|
"url":
|
||||||
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip",
|
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip",
|
||||||
|
Loading…
Reference in New Issue
Block a user