mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Add support for BBEH dataset (#1925)
* bbeh * bbeh * fix_smallbugs_bbeh * removeprint * results --------- Co-authored-by: yufeng zhao <zhaoyufeng@pjlab.org.cn>
This commit is contained in:
parent
59e49aedf1
commit
bc2969dba8
@ -234,6 +234,11 @@
|
|||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2210.09261
|
paper: https://arxiv.org/pdf/2210.09261
|
||||||
configpath: opencompass/configs/datasets/bbh
|
configpath: opencompass/configs/datasets/bbh
|
||||||
|
- bbeh:
|
||||||
|
name: BIG-Bench Extra Hard
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/abs/2502.19187
|
||||||
|
configpath: opencompass/configs/datasets/bbeh
|
||||||
- BoolQ:
|
- BoolQ:
|
||||||
name: SuperGLUE / BoolQ
|
name: SuperGLUE / BoolQ
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
|
26
opencompass/configs/datasets/bbeh/README.md
Normal file
26
opencompass/configs/datasets/bbeh/README.md
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# BBEH
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
|
||||||
|
python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
|
||||||
|
```
|
||||||
|
|
||||||
|
## Models
|
||||||
|
|
||||||
|
| model | score |
|
||||||
|
|:-----------------------------------------:|------:|
|
||||||
|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |
|
||||||
|
|
||||||
|
### Details
|
||||||
|
|
||||||
|
| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa |
|
||||||
|
|:-----------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|-----:|-----------------:|-------------:|
|
||||||
|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 |
|
||||||
|
|
||||||
|
| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
|
||||||
|
|:-----------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:|
|
||||||
|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 |
|
||||||
|
|
||||||
|
| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
|
||||||
|
|:-----------------------------------------:|------------------:|-------:|-----------------:|----------------:|------------:|-------------:|--------------:|
|
||||||
|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 |
|
93
opencompass/configs/datasets/bbeh/bbeh_gen.py
Normal file
93
opencompass/configs/datasets/bbeh/bbeh_gen.py
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BBEHDataset, BBEHEvaluator,
                                  BBEHEvaluator_mcq, bbeh_freeform_postprocess,
                                  bbeh_mcq_postprocess)

# Every BBEH subset exposes a single 'input' column and a 'target' answer.
bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Zero-shot chain-of-thought prompt shared by all subsets. `{input}` is the
# PromptTemplate placeholder filled with the question text at inference time.
# NOTE(review): the missing space in '"The answer is:"without' is kept as-is —
# changing the prompt would change model outputs and reported scores.
_BBEH_PROMPT = (
    'Think step by step, and when you provide the final answer, please use '
    'the prefix "The answer is:"without any modification, and provide the '
    'answer directly, with no formatting, no bolding, and no markup. For '
    'instance: "The answer is: 42" or "The answer is: yes". If the question '
    'is multiple choice with a single correct answer, the final answer must '
    'only be the letter corresponding to the correct answer. For example, '
    '"The answer is: (a)"\n\nQ: {input}\nA: ')

# Subsets whose answers are option letters like '(a)'.
bbeh_multiple_choice_sets = [
    'bbeh_boolean_expressions',
    'bbeh_disambiguation_qa',
    'bbeh_geometric_shapes',
    'bbeh_hyperbaton',
    'bbeh_movie_recommendation',
    'bbeh_nycc',
    'bbeh_shuffled_objects',
]

# Subsets whose answers are free-form strings (numbers, words, lists, ...).
bbeh_free_form_sets = [
    'bbeh_boardgame_qa',
    'bbeh_buggy_tables',
    'bbeh_causal_understanding',
    'bbeh_dyck_languages',
    'bbeh_linguini',
    'bbeh_multistep_arithmetic',
    'bbeh_object_counting',
    'bbeh_object_properties',
    'bbeh_sarc_triples',
    'bbeh_spatial_reasoning',
    'bbeh_sportqa',
    'bbeh_temporal_sequence',
    'bbeh_time_arithmetic',
    'bbeh_web_of_lies',
    'bbeh_word_sorting',
    'bbeh_zebra_puzzles',
]

bbeh_datasets = []

for _name in bbeh_multiple_choice_sets:
    bbeh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=_BBEH_PROMPT),
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))
    # Multiple-choice subsets: strip parentheses from both prediction and
    # reference so '(a)' and 'a' compare equal.
    bbeh_eval_cfg = dict(
        evaluator=dict(type=BBEHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbeh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbeh_mcq_postprocess))

    bbeh_datasets.append(
        dict(
            type=BBEHDataset,
            path='opencompass/bbeh',
            name=_name,
            abbr=_name,
            reader_cfg=bbeh_reader_cfg,
            infer_cfg=bbeh_infer_cfg.copy(),
            eval_cfg=bbeh_eval_cfg.copy()))

for _name in bbeh_free_form_sets:
    bbeh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=_BBEH_PROMPT),
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))
    # FIX: free-form subsets must use the free-form postprocessor. The
    # previous config applied bbeh_mcq_postprocess here, which strips
    # surrounding parentheses — wrong for free-form answers such as '(1, 2)'.
    bbeh_eval_cfg = dict(
        evaluator=dict(type=BBEHEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbeh_freeform_postprocess),
        dataset_postprocessor=dict(type=bbeh_freeform_postprocess))

    bbeh_datasets.append(
        dict(
            type=BBEHDataset,
            path='opencompass/bbeh',
            name=_name,
            abbr=_name,
            reader_cfg=bbeh_reader_cfg,
            infer_cfg=bbeh_infer_cfg.copy(),
            eval_cfg=bbeh_eval_cfg.copy()))
12
opencompass/configs/summarizers/groups/bbeh.py
Normal file
12
opencompass/configs/summarizers/groups/bbeh.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# Summarizer group for BBEH: aggregates the 23 per-subset scores into a
# single 'bbeh' entry in the results table.
_bbeh_subsets = [
    'bbeh_boolean_expressions', 'bbeh_disambiguation_qa',
    'bbeh_geometric_shapes', 'bbeh_hyperbaton', 'bbeh_movie_recommendation',
    'bbeh_nycc', 'bbeh_shuffled_objects', 'bbeh_boardgame_qa',
    'bbeh_buggy_tables', 'bbeh_causal_understanding', 'bbeh_dyck_languages',
    'bbeh_linguini', 'bbeh_multistep_arithmetic', 'bbeh_object_counting',
    'bbeh_object_properties', 'bbeh_sarc_triples', 'bbeh_spatial_reasoning',
    'bbeh_sportqa', 'bbeh_temporal_sequence', 'bbeh_time_arithmetic',
    'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles'
]

bbeh_summary_groups = [{'name': 'bbeh', 'subsets': _bbeh_subsets}]
|
@ -9,6 +9,7 @@ from .arc import * # noqa: F401, F403
|
|||||||
from .arc_prize_public_evaluation import * # noqa: F401, F403
|
from .arc_prize_public_evaluation import * # noqa: F401, F403
|
||||||
from .ax import * # noqa: F401, F403
|
from .ax import * # noqa: F401, F403
|
||||||
from .babilong import * # noqa: F401, F403
|
from .babilong import * # noqa: F401, F403
|
||||||
|
from .bbeh import * # noqa: F401, F403
|
||||||
from .bbh import * # noqa: F401, F403
|
from .bbh import * # noqa: F401, F403
|
||||||
from .bigcodebench import * # noqa: F401, F403
|
from .bigcodebench import * # noqa: F401, F403
|
||||||
from .boolq import * # noqa: F401, F403
|
from .boolq import * # noqa: F401, F403
|
||||||
|
149
opencompass/datasets/bbeh.py
Normal file
149
opencompass/datasets/bbeh.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
import json
|
||||||
|
import os.path as osp
|
||||||
|
import re
|
||||||
|
from os import environ
|
||||||
|
|
||||||
|
from datasets import Dataset
|
||||||
|
|
||||||
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||||
|
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
|
||||||
|
TEXT_POSTPROCESSORS)
|
||||||
|
from opencompass.utils import get_data_path
|
||||||
|
|
||||||
|
from .base import BaseDataset
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
class BBEHDataset(BaseDataset):
    """Loader for BIG-Bench Extra Hard (BBEH) subsets."""

    @staticmethod
    def load(path: str, name: str):
        """Load one BBEH subset.

        Args:
            path: Dataset root (resolved via ``get_data_path``).
            name: Subset name, e.g. ``bbeh_boolean_expressions``; used as the
                subdirectory containing ``task.json``.

        Returns:
            A dataset whose rows are the subset's examples.
        """
        root = get_data_path(path)
        # ModelScope hosting is selected via the DATASET_SOURCE env var.
        if environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            return MsDataset.load(root, subset_name=name, split='test')
        # Local layout: <root>/<name>/task.json with an 'examples' list.
        with open(osp.join(root, f'{name}/task.json'), 'r') as f:
            examples = json.load(f)['examples']
        return Dataset.from_list(examples)
|
||||||
|
|
||||||
|
|
||||||
|
@TEXT_POSTPROCESSORS.register_module('bbeh_freeform')
def bbeh_freeform_postprocess(text: str) -> str:
    """Extract and normalize a free-form BBEH answer.

    Takes the text after the last recognized answer prefix, strips common
    markup (LaTeX \\boxed / \\text, markdown bold), keeps only the first
    line, and lowercases the result.
    """
    # Longer/colon variants come first so they win over their bare prefixes.
    prefixes = (
        'The answer is: ',
        'The answer is ',
        'The final answer is: ',
        'The final answer is ',
    )
    answer = text
    for prefix in prefixes:
        if prefix in text:
            # Use the LAST occurrence: CoT traces may quote the prefix early.
            answer = text.split(prefix)[-1]
            break

    # Markup removal: (needle to test for, pattern unwrapping the content).
    markup = (
        ('\\boxed', r'\\boxed{(.*?)}'),          # LaTeX box
        ('\\text', r'\\text(?:tt)?{(.*?)}'),     # \text / \texttt
        ('**', r'\*\*(.*?)\*\*'),                # markdown bold
    )
    for needle, pattern in markup:
        if needle in answer:
            answer = re.sub(pattern, r'\1', answer)

    # Keep only the first line of whatever follows the prefix.
    if '\n' in answer:
        answer = answer.split('\n')[0].strip()

    return answer.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
@TEXT_POSTPROCESSORS.register_module('bbeh_mcq')
def bbeh_mcq_postprocess(text: str) -> str:
    """Extract a multiple-choice BBEH answer (e.g. ``(a)`` -> ``a``).

    Takes the text after the last recognized answer prefix, keeps only the
    first line, strips surrounding parentheses, and lowercases the result.
    """
    # Longer/colon variants come first so they win over their bare prefixes.
    prefixes = [
        'The answer is: ', 'The answer is ', 'The final answer is: ',
        'The final answer is '
    ]
    answer = text
    for prefix in prefixes:
        if prefix in text:
            # Use the LAST occurrence: CoT traces may quote the prefix early.
            answer = text.split(prefix)[-1]
            break

    # FIX: take the first line BEFORE stripping parentheses. Previously
    # str.strip('()') ran on the whole multi-line tail, so a trailing ')'
    # on the answer line survived whenever explanation lines followed:
    # 'The answer is: (a)\nBecause...' yielded 'a)' instead of 'a'.
    if '\n' in answer:
        answer = answer.split('\n')[0].strip()

    # Remove surrounding parentheses if present: '(a)' -> 'a'.
    answer = answer.strip('()')

    return answer.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
class BBEHEvaluator(BaseEvaluator):
    """Accuracy evaluator for free-form BBEH subsets.

    Predictions are normalized with ``bbeh_freeform_postprocess``; references
    are lowercased. A prediction counts as correct under any of the matching
    rules documented in ``score``.
    """

    def score(self, predictions, references):
        """Return ``{'score': <0-100 accuracy>, 'details': [...]}``.

        Args:
            predictions: Raw model outputs, one per example.
            references: Gold answers, one per example.
        """
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length'
            }
        # FIX: guard the empty split — the final accuracy computation
        # divided by len(predictions) and raised ZeroDivisionError.
        if not predictions:
            return {'score': 0.0, 'details': []}

        processed_preds = [bbeh_freeform_postprocess(p) for p in predictions]
        # References are already plain answers; only case-normalize them.
        processed_refs = [r.lower() for r in references]

        details = []
        correct_count = 0

        for pred, ref in zip(processed_preds, processed_refs):
            correct = False

            # Rule 1: exact match.
            if pred == ref:
                correct = True
            # Rule 2: match after removing quotes/brackets from the reference.
            elif pred == ref.strip("'\"()[]"):
                correct = True
            # Rule 3: comma-separated answers — ignore spacing around commas.
            # (Previously mislabeled 'Rule 4'; there is no other rule.)
            elif ',' in ref:
                norm_pred = re.sub(r'\s*,\s*', ',', pred)
                norm_ref = re.sub(r'\s*,\s*', ',', ref)
                if norm_pred == norm_ref:
                    correct = True

            details.append({'pred': pred, 'answer': ref, 'correct': correct})
            correct_count += int(correct)

        score = (correct_count / len(predictions)) * 100
        return {'score': score, 'details': details}
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
class BBEHEvaluator_mcq(BaseEvaluator):
    """Accuracy evaluator for multiple-choice BBEH subsets.

    Predictions are normalized with ``bbeh_mcq_postprocess``; references are
    lowercased and stripped of surrounding parentheses, so ``(a)`` and ``a``
    compare equal. Only exact match counts as correct.
    """

    def score(self, predictions, references):
        """Return ``{'score': <0-100 accuracy>, 'details': [...]}``.

        Args:
            predictions: Raw model outputs, one per example.
            references: Gold answers, one per example.
        """
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length'
            }
        # FIX: guard the empty split — the final accuracy computation
        # divided by len(predictions) and raised ZeroDivisionError.
        if not predictions:
            return {'score': 0.0, 'details': []}

        processed_preds = [bbeh_mcq_postprocess(p) for p in predictions]
        # Normalize references the same way: lowercase, drop parentheses.
        processed_refs = [r.lower().strip('()') for r in references]

        details = []
        correct_count = 0

        for pred, ref in zip(processed_preds, processed_refs):
            # Exact match only — option letters leave no room for fuzziness.
            correct = pred == ref

            details.append({'pred': pred, 'answer': ref, 'correct': correct})
            correct_count += int(correct)

        score = (correct_count / len(predictions)) * 100
        return {'score': score, 'details': details}
|
@ -33,6 +33,12 @@ DATASETS_MAPPING = {
|
|||||||
"hf_id": "opencompass/bbh",
|
"hf_id": "opencompass/bbh",
|
||||||
"local": "./data/BBH/data",
|
"local": "./data/BBH/data",
|
||||||
},
|
},
|
||||||
|
# bbeh
|
||||||
|
"opencompass/bbeh": {
|
||||||
|
"ms_id": "",
|
||||||
|
"hf_id": "",
|
||||||
|
"local": "./data/bbeh/",
|
||||||
|
},
|
||||||
# C-Eval
|
# C-Eval
|
||||||
"opencompass/ceval-exam": {
|
"opencompass/ceval-exam": {
|
||||||
"ms_id": "opencompass/ceval-exam",
|
"ms_id": "opencompass/ceval-exam",
|
||||||
@ -691,6 +697,10 @@ DATASETS_URL = {
|
|||||||
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
|
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
|
||||||
"md5": "9107597d137e7362eaf7d218ddef7a6d",
|
"md5": "9107597d137e7362eaf7d218ddef7a6d",
|
||||||
},
|
},
|
||||||
|
"/bbeh": {
|
||||||
|
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bbeh.zip",
|
||||||
|
"md5": "43a3c2d73aee731ac68ac790bc9a358e",
|
||||||
|
},
|
||||||
"subjective/judgerbench": {
|
"subjective/judgerbench": {
|
||||||
"url":
|
"url":
|
||||||
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip",
|
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip",
|
||||||
|
Loading…
Reference in New Issue
Block a user