From bc2969dba8b97e8caef54c5ae98d02a4af2f17b5 Mon Sep 17 00:00:00 2001
From: Yufeng Zhao <115388472+epsilondylan@users.noreply.github.com>
Date: Wed, 12 Mar 2025 10:53:31 +0800
Subject: [PATCH] [Feature] Add support for BBEH dataset (#1925)

* bbeh

* bbeh

* fix_smallbugs_bbeh

* removeprint

* results

---------

Co-authored-by: yufeng zhao
---
 dataset-index.yml                             |   5 +
 opencompass/configs/datasets/bbeh/README.md   |  26 +++
 opencompass/configs/datasets/bbeh/bbeh_gen.py |  93 +++++++++++
 .../configs/summarizers/groups/bbeh.py        |  12 ++
 opencompass/datasets/__init__.py              |   1 +
 opencompass/datasets/bbeh.py                  | 149 ++++++++++++++++++
 opencompass/utils/datasets_info.py            |  10 ++
 7 files changed, 296 insertions(+)
 create mode 100644 opencompass/configs/datasets/bbeh/README.md
 create mode 100644 opencompass/configs/datasets/bbeh/bbeh_gen.py
 create mode 100644 opencompass/configs/summarizers/groups/bbeh.py
 create mode 100644 opencompass/datasets/bbeh.py

diff --git a/dataset-index.yml b/dataset-index.yml
index f72e7362..e998f65f 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -234,6 +234,11 @@
     category: Reasoning
     paper: https://arxiv.org/pdf/2210.09261
     configpath: opencompass/configs/datasets/bbh
+- bbeh:
+    name: BIG-Bench Extra Hard
+    category: Reasoning
+    paper: https://arxiv.org/abs/2502.19187
+    configpath: opencompass/configs/datasets/bbeh
 - BoolQ:
     name: SuperGLUE / BoolQ
     category: Knowledge
diff --git a/opencompass/configs/datasets/bbeh/README.md b/opencompass/configs/datasets/bbeh/README.md
new file mode 100644
index 00000000..1fd034ff
--- /dev/null
+++ b/opencompass/configs/datasets/bbeh/README.md
@@ -0,0 +1,26 @@
+# BBEH
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
+python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
+```
+
+## Models
+
+| model | score |
+|:-----------------------------------------:|------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |
+
+### Details
+
+| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa |
+|:-----------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|-----:|-----------------:|-------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 |
+
+| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
+|:-----------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 |
+
+| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
+|:-----------------------------------------:|------------------:|-------:|-----------------:|----------------:|------------:|-------------:|--------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 |
diff --git a/opencompass/configs/datasets/bbeh/bbeh_gen.py b/opencompass/configs/datasets/bbeh/bbeh_gen.py
new file mode 100644
index 00000000..522ade24
--- /dev/null
+++ b/opencompass/configs/datasets/bbeh/bbeh_gen.py
@@ -0,0 +1,93 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BBEHDataset, BBEHEvaluator, bbeh_mcq_postprocess, BBEHEvaluator_mcq
+
+bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+
+bbeh_multiple_choice_sets = [
+    'bbeh_boolean_expressions',
+    'bbeh_disambiguation_qa',
+    'bbeh_geometric_shapes',
+    'bbeh_hyperbaton',
+    'bbeh_movie_recommendation',
+    'bbeh_nycc',
+    'bbeh_shuffled_objects',
+]
+
+bbeh_free_form_sets = [
+    'bbeh_boardgame_qa',
+    'bbeh_buggy_tables',
+    'bbeh_causal_understanding',
+    'bbeh_dyck_languages',
+    'bbeh_linguini',
+    'bbeh_multistep_arithmetic',
+    'bbeh_object_counting',
+    'bbeh_object_properties',
+    'bbeh_sarc_triples',
+    'bbeh_spatial_reasoning',
+    'bbeh_sportqa',
+    'bbeh_temporal_sequence',
+    'bbeh_time_arithmetic',
+    'bbeh_web_of_lies',
+    'bbeh_word_sorting',
+    'bbeh_zebra_puzzles',
+]
+
+bbeh_datasets = []
+for _name in bbeh_multiple_choice_sets:
+    bbeh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\" without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=8192))
+    bbeh_eval_cfg = dict(
+        evaluator=dict(type=BBEHEvaluator_mcq),
+        pred_role='BOT',
+        pred_postprocessor=dict(type=bbeh_mcq_postprocess),
+        dataset_postprocessor=dict(type=bbeh_mcq_postprocess))
+
+    bbeh_datasets.append(
+        dict(
+            type=BBEHDataset,
+            path='opencompass/bbeh',
+            name=_name,
+            abbr=_name,
+            reader_cfg=bbeh_reader_cfg,
+            infer_cfg=bbeh_infer_cfg.copy(),
+            eval_cfg=bbeh_eval_cfg.copy()))
+
+for _name in bbeh_free_form_sets:
+    bbeh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\" without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=8192))
+    bbeh_eval_cfg = dict(evaluator=dict(type=BBEHEvaluator), pred_role='BOT', pred_postprocessor=dict(type=bbeh_mcq_postprocess), dataset_postprocessor=dict(type=bbeh_mcq_postprocess))
+
+    bbeh_datasets.append(
+        dict(
+            type=BBEHDataset,
+            path='opencompass/bbeh',
+            name=_name,
+            abbr=_name,
+            reader_cfg=bbeh_reader_cfg,
+            infer_cfg=bbeh_infer_cfg.copy(),
+            eval_cfg=bbeh_eval_cfg.copy()))
\ No newline at end of file
diff --git a/opencompass/configs/summarizers/groups/bbeh.py b/opencompass/configs/summarizers/groups/bbeh.py
new file mode 100644
index 00000000..5e5cc222
--- /dev/null
+++ b/opencompass/configs/summarizers/groups/bbeh.py
@@ -0,0 +1,12 @@
+bbeh_summary_groups = []
+
+# bbeh
+_bbeh = [
+    'bbeh_boolean_expressions', 'bbeh_disambiguation_qa', 'bbeh_geometric_shapes', 'bbeh_hyperbaton',
+    'bbeh_movie_recommendation', 'bbeh_nycc', 'bbeh_shuffled_objects', 'bbeh_boardgame_qa',
+    'bbeh_buggy_tables', 'bbeh_causal_understanding', 'bbeh_dyck_languages', 'bbeh_linguini',
+    'bbeh_multistep_arithmetic', 'bbeh_object_counting', 'bbeh_object_properties', 'bbeh_sarc_triples',
+    'bbeh_spatial_reasoning', 'bbeh_sportqa', 'bbeh_temporal_sequence', 'bbeh_time_arithmetic',
+    'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles'
+]
+bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh})
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index ffcc217d..6d135f61 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -9,6 +9,7 @@ from .arc import *  # noqa: F401, F403
 from .arc_prize_public_evaluation import *  # noqa: F401, F403
 from .ax import *  # noqa: F401, F403
 from .babilong import *  # noqa: F401, F403
+from .bbeh import *  # noqa: F401, F403
 from .bbh import *  # noqa: F401, F403
 from .bigcodebench import *  # noqa: F401, F403
 from .boolq import *  # noqa: F401, F403
diff --git a/opencompass/datasets/bbeh.py b/opencompass/datasets/bbeh.py
new file mode 100644
index 00000000..0b3a49a7
--- /dev/null
+++ b/opencompass/datasets/bbeh.py
@@ -0,0 +1,149 @@
+import json
+import os.path as osp
+import re
+from os import environ
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
+                                  TEXT_POSTPROCESSORS)
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class BBEHDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, name: str):
+        path = get_data_path(path)
+        if environ.get('DATASET_SOURCE') == 'ModelScope':
+            from modelscope import MsDataset
+            dataset = MsDataset.load(path, subset_name=name, split='test')
+        else:
+            with open(osp.join(path, f'{name}/task.json'), 'r') as f:
+                data = json.load(f)['examples']
+            dataset = Dataset.from_list(data)
+        return dataset
+
+
+@TEXT_POSTPROCESSORS.register_module('bbeh_freeform')
+def bbeh_freeform_postprocess(text: str) -> str:
+    # Extract answer using specified prefixes
+    prefixes = [
+        'The answer is: ', 'The answer is ', 'The final answer is: ',
+        'The final answer is '
+    ]
+    answer = text
+    for prefix in prefixes:
+        if prefix in text:
+            answer = text.split(prefix)[-1]
+            break
+
+    # Remove formatting markup
+    if '\\boxed' in answer:
+        answer = re.sub(r'\\boxed{(.*?)}', r'\1', answer)  # latex box
+    if '\\text' in answer:
+        answer = re.sub(r'\\text(?:tt)?{(.*?)}', r'\1', answer)  # text/texttt
+    if '**' in answer:
+        answer = re.sub(r'\*\*(.*?)\*\*', r'\1', answer)  # bold
+
+    # Take first line and clean
+    if '\n' in answer:
+        answer = answer.split('\n')[0].strip()
+
+    return answer.strip().lower()
+
+
+@TEXT_POSTPROCESSORS.register_module('bbeh_mcq')
+def bbeh_mcq_postprocess(text: str) -> str:
+    # Extract answer using specified prefixes
+    prefixes = [
+        'The answer is: ', 'The answer is ', 'The final answer is: ',
+        'The final answer is '
+    ]
+    answer = text
+    for prefix in prefixes:
+        if prefix in text:
+            answer = text.split(prefix)[-1]
+            break
+
+    # Remove parentheses if present
+    answer = answer.strip('()')
+
+    # Take first line and clean
+    if '\n' in answer:
+        answer = answer.split('\n')[0].strip()
+
+    return answer.strip().lower()
+
+
+@ICL_EVALUATORS.register_module()
+class BBEHEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+
+        processed_preds = [bbeh_freeform_postprocess(p) for p in predictions]
+        # References are already in correct format
+        processed_refs = [r.lower() for r in references]
+
+        details = []
+        correct_count = 0
+
+        for pred, ref in zip(processed_preds, processed_refs):
+            correct = False
+
+            # Rule 1: Exact match
+            if pred == ref:
+                correct = True
+            # Rule 2: Match after removing quotes/brackets
+            elif pred == ref.strip("'\"()[]"):
+                correct = True
+            # Rule 3: Comma-separated answers
+            elif ',' in ref:
+                norm_pred = re.sub(r'\s*,\s*', ',', pred)
+                norm_ref = re.sub(r'\s*,\s*', ',', ref)
+                if norm_pred == norm_ref:
+                    correct = True
+
+            details.append({'pred': pred, 'answer': ref, 'correct': correct})
+            correct_count += int(correct)
+
+        score = (correct_count / len(predictions)) * 100
+        return {'score': score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class BBEHEvaluator_mcq(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+
+        processed_preds = [bbeh_mcq_postprocess(p) for p in predictions]
+        # References are already in correct format
+        processed_refs = [r.lower().strip('()') for r in references]
+
+        details = []
+        correct_count = 0
+
+        for pred, ref in zip(processed_preds, processed_refs):
+            correct = False
+
+            # Rule 1: Exact match
+            if pred == ref:
+                correct = True
+
+            details.append({'pred': pred, 'answer': ref, 'correct': correct})
+            correct_count += int(correct)
+
+        score = (correct_count / len(predictions)) * 100
+        return {'score': score, 'details': details}
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index 79be5736..25c877c6 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -33,6 +33,12 @@
         "hf_id": "opencompass/bbh",
         "local": "./data/BBH/data",
     },
+    # bbeh
+    "opencompass/bbeh": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/bbeh/",
+    },
     # C-Eval
     "opencompass/ceval-exam": {
         "ms_id": "opencompass/ceval-exam",
@@ -691,6 +697,10 @@
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
         "md5": "9107597d137e7362eaf7d218ddef7a6d",
     },
+    "/bbeh": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bbeh.zip",
+        "md5": "43a3c2d73aee731ac68ac790bc9a358e",
+    },
     "subjective/judgerbench": {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip",