Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit d99179d0ef (parent 9f491fa2d1): fix_smallbugs_bbeh
@@ -237,7 +237,7 @@
 - bbeh:
     name: BIG-Bench Extra Hard
     category: Reasoning
-    paper:https://arxiv.org/abs/2502.19187
+    paper: https://arxiv.org/abs/2502.19187
     configpath: opencompass/configs/datasets/bbeh
 - BoolQ:
     name: SuperGLUE / BoolQ
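The one-character fix above matters more than it looks: YAML only treats `key: value` as a mapping when the colon is followed by whitespace, so the old `paper:https://...` line parsed as a single plain scalar instead of a `paper` field. A quick PyYAML check illustrating the difference (hypothetical snippet, not part of the commit):

```python
import yaml

# Without a space after the colon, YAML reads the whole line as one string.
print(yaml.safe_load('paper:https://arxiv.org/abs/2502.19187'))
# -> 'paper:https://arxiv.org/abs/2502.19187'

# With the space, it parses as a mapping with a 'paper' key.
print(yaml.safe_load('paper: https://arxiv.org/abs/2502.19187'))
# -> {'paper': 'https://arxiv.org/abs/2502.19187'}
```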
@@ -1,23 +0,0 @@
-import os
-import huggingface_hub.constants as hf_constants
-from huggingface_hub import set_cache_dir
-from datasets import get_dataset_config_names  # Optional, if you need dataset-related functionality
-
-# Set a new cache directory
-new_cache_dir = "/fs-computility/llm/shared/llmeval/models/opencompass_hf_hub"  # Replace with your desired path
-set_cache_dir(new_cache_dir)
-
-# Alternatively, you can set the environment variable
-# os.environ["HF_HOME"] = new_cache_dir
-
-# Root cache path for Hugging Face
-root_cache_dir = hf_constants.HF_HOME
-print(f"Root HF cache path: {root_cache_dir}")
-
-# Dataset cache path (typically under HF_HOME/datasets)
-dataset_cache_dir = f"{root_cache_dir}/datasets"
-print(f"Dataset cache path: {dataset_cache_dir}")
-
-# Model cache path (typically under HF_HOME/hub)
-model_cache_dir = f"{root_cache_dir}/hub"
-print(f"Model cache path: {model_cache_dir}")
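The deleted script above also had a real bug: `set_cache_dir` is not part of `huggingface_hub`'s public API, so the import would fail at runtime, and dropping the file is the cleanest fix. For anyone who needs the same effect, the environment-variable route the script itself hinted at is the supported one. A minimal sketch, reusing the cache path from the deleted script purely for illustration:

```python
import os

# HF_HOME must be set before huggingface_hub (or datasets) is imported,
# because the cache paths are resolved once at import time.
os.environ['HF_HOME'] = '/fs-computility/llm/shared/llmeval/models/opencompass_hf_hub'

import huggingface_hub.constants as hf_constants

print(f'Root HF cache path: {hf_constants.HF_HOME}')     # <HF_HOME>
print(f'Model cache path: {hf_constants.HF_HUB_CACHE}')  # <HF_HOME>/hub
```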
opencompass/configs/datasets/bbeh/README.md (new file, 26 lines)
@@ -0,0 +1,26 @@
+# BBEH
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
+python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
+```
+
+## Models
+
+| model | score |
+|:-----------------------------------------:|------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |
+
+### Details
+
+| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa |
+|:-----------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|-----:|-----------------:|-------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 |
+
+| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
+|:-----------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 |
+
+| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
+|:-----------------------------------------:|------------------:|--------:|------------------:|----------------:|------------:|-------------:|--------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 |
@@ -9,8 +9,8 @@ from .arc import *  # noqa: F401, F403
 from .arc_prize_public_evaluation import *  # noqa: F401, F403
 from .ax import *  # noqa: F401, F403
 from .babilong import *  # noqa: F401, F403
-from .bbh import *  # noqa: F401, F403
 from .bbeh import *  # noqa: F401, F403
+from .bbh import *  # noqa: F401, F403
 from .bigcodebench import *  # noqa: F401, F403
 from .boolq import *  # noqa: F401, F403
 from .bustum import *  # noqa: F401, F403
@@ -15,6 +15,7 @@ from .base import BaseDataset
 
 @LOAD_DATASET.register_module()
 class BBEHDataset(BaseDataset):
+
     @staticmethod
     def load(path: str, name: str):
         path = get_data_path(path)
@@ -32,9 +33,7 @@ class BBEHDataset(BaseDataset):
 def bbeh_freeform_postprocess(text: str) -> str:
     # Extract answer using specified prefixes
     prefixes = [
-        'The answer is: ',
-        'The answer is ',
-        'The final answer is: ',
+        'The answer is: ', 'The answer is ', 'The final answer is: ',
         'The final answer is '
     ]
     answer = text
@@ -62,9 +61,7 @@ def bbeh_freeform_postprocess(text: str) -> str:
 def bbeh_mcq_postprocess(text: str) -> str:
     # Extract answer using specified prefixes
     prefixes = [
-        'The answer is: ',
-        'The answer is ',
-        'The final answer is: ',
+        'The answer is: ', 'The answer is ', 'The final answer is: ',
         'The final answer is '
     ]
     answer = text
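Both postprocess hunks above are pure reflow: the four answer prefixes are unchanged, only rewrapped. For context, a minimal sketch of how such a prefix list is typically consumed (assumed logic for illustration only; the actual `bbeh_freeform_postprocess` and `bbeh_mcq_postprocess` in bbeh.py also normalize case and punctuation):

```python
def extract_after_prefix(text: str, prefixes: list) -> str:
    # Return whatever follows the first matching answer prefix,
    # falling back to the raw text when no prefix is present.
    for prefix in prefixes:
        idx = text.find(prefix)
        if idx != -1:
            return text[idx + len(prefix):].strip()
    return text.strip()


# Toy check with prefixes from the diff (hypothetical helper, not part of the commit):
print(extract_after_prefix('Reasoning... The final answer is 42',
                           ['The answer is: ', 'The final answer is ']))  # '42'
```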
@@ -85,12 +82,16 @@ def bbeh_mcq_postprocess(text: str) -> str:
 
 @ICL_EVALUATORS.register_module()
 class BBEHEvaluator(BaseEvaluator):
+
     def score(self, predictions, references):
         if len(predictions) != len(references):
-            return {'error': 'predictions and references have different length'}
+            return {
+                'error': 'predictions and references have different length'
+            }
 
         processed_preds = [bbeh_freeform_postprocess(p) for p in predictions]
-        processed_refs = [r.lower() for r in references]  # References are already in correct format
+        # References are already in correct format
+        processed_refs = [r.lower() for r in references]
 
         details = []
         correct_count = 0
@@ -111,11 +112,7 @@ class BBEHEvaluator(BaseEvaluator):
             if norm_pred == norm_ref:
                 correct = True
 
-            details.append({
-                'pred': pred,
-                'answer': ref,
-                'correct': correct
-            })
+            details.append({'pred': pred, 'answer': ref, 'correct': correct})
             correct_count += int(correct)
 
         score = (correct_count / len(predictions)) * 100
@@ -124,12 +121,16 @@
 
 @ICL_EVALUATORS.register_module()
 class BBEHEvaluator_mcq(BaseEvaluator):
+
     def score(self, predictions, references):
         if len(predictions) != len(references):
-            return {'error': 'predictions and references have different length'}
+            return {
+                'error': 'predictions and references have different length'
+            }
 
         processed_preds = [bbeh_mcq_postprocess(p) for p in predictions]
-        processed_refs = [r.lower().strip('()') for r in references]  # References are already in correct format
+        # References are already in correct format
+        processed_refs = [r.lower().strip('()') for r in references]
 
         details = []
         correct_count = 0
@@ -141,11 +142,7 @@ class BBEHEvaluator_mcq(BaseEvaluator):
             if pred == ref:
                 correct = True
 
-            details.append({
-                'pred': pred,
-                'answer': ref,
-                'correct': correct
-            })
+            details.append({'pred': pred, 'answer': ref, 'correct': correct})
             correct_count += int(correct)
 
         score = (correct_count / len(predictions)) * 100
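Net effect of the evaluator hunks: formatting only (wrapped return dict, hoisted comment, collapsed `details.append`), with scoring behavior untouched. A hypothetical usage sketch; the `score`/`details` return keys are inferred from the visible `score = ...` and `details` lines, not confirmed by this diff:

```python
from opencompass.datasets.bbeh import BBEHEvaluator

preds = ['Step by step... The answer is: 42',
         'Step by step... The final answer is no']
refs = ['42', 'yes']

result = BBEHEvaluator().score(preds, refs)
print(result['score'])    # 50.0 for this toy input (1 of 2 correct)
print(result['details'])  # per-example {'pred', 'answer', 'correct'} records
```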