diff --git a/dataset-index.yml b/dataset-index.yml
index d13abaab..872e420e 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -237,7 +237,7 @@
 - bbeh:
     name: BIG-Bench Extra Hard
     category: Reasoning
-    paper:https://arxiv.org/abs/2502.19187
+    paper: https://arxiv.org/abs/2502.19187
     configpath: opencompass/configs/datasets/bbeh
 - BoolQ:
     name: SuperGLUE / BoolQ
diff --git a/hf_settings.py b/hf_settings.py
deleted file mode 100644
index c03490ba..00000000
--- a/hf_settings.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import os
-import huggingface_hub.constants as hf_constants
-from huggingface_hub import set_cache_dir
-from datasets import get_dataset_config_names  # Optional, if you need dataset-related functionality
-
-# Set a new cache directory
-new_cache_dir = "/fs-computility/llm/shared/llmeval/models/opencompass_hf_hub"  # Replace with your desired path
-set_cache_dir(new_cache_dir)
-
-# Alternatively, you can set the environment variable
-# os.environ["HF_HOME"] = new_cache_dir
-
-# Root cache path for Hugging Face
-root_cache_dir = hf_constants.HF_HOME
-print(f"Root HF cache path: {root_cache_dir}")
-
-# Dataset cache path (typically under HF_HOME/datasets)
-dataset_cache_dir = f"{root_cache_dir}/datasets"
-print(f"Dataset cache path: {dataset_cache_dir}")
-
-# Model cache path (typically under HF_HOME/hub)
-model_cache_dir = f"{root_cache_dir}/hub"
-print(f"Model cache path: {model_cache_dir}")
\ No newline at end of file
diff --git a/opencompass/configs/datasets/bbeh/README.md b/opencompass/configs/datasets/bbeh/README.md
new file mode 100644
index 00000000..1fd034ff
--- /dev/null
+++ b/opencompass/configs/datasets/bbeh/README.md
@@ -0,0 +1,26 @@
+# BBEH
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
+python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
+```
+
+## Models
+
+| model                                 | score |
+|:-------------------------------------:|------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |
+
+### Details
+
+| model                                 | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc  | shuffled_objects | boardgame_qa |
+|:-------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|------:|-----------------:|-------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00               | 33.33             | 13.50            | 1.00       | 28.00                | 11.00 | 10.00            | 18.50        |
+
+| model                                 | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
+|:-------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00         | 42.50                | 3.50           | 2.00     | 0.00                 | 0.00            | 1.00              | 17.00        |
+
+| model                                 | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
+|:-------------------------------------:|------------------:|--------:|------------------:|----------------:|------------:|-------------:|--------------:|
+| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00              | 5.00    | 2.00              | 3.00            | 7.50        | 2.00         | 3.50          |
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 1a67e252..9899c7d7 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -9,8 +9,8 @@ from .arc import *  # noqa: F401, F403
 from .arc_prize_public_evaluation import *  # noqa: F401, F403
 from .ax import *  # noqa: F401, F403
 from .babilong import *  # noqa: F401, F403
-from .bbh import *  # noqa: F401, F403
 from .bbeh import *  # noqa: F401, F403
+from .bbh import *  # noqa: F401, F403
 from .bigcodebench import *  # noqa: F401, F403
 from .boolq import *  # noqa: F401, F403
 from .bustum import *  # noqa: F401, F403
diff --git a/opencompass/datasets/bbeh.py b/opencompass/datasets/bbeh.py
index d00cabec..0b3a49a7 100644
--- a/opencompass/datasets/bbeh.py
+++ b/opencompass/datasets/bbeh.py
@@ -15,6 +15,7 @@ from .base import BaseDataset
 
 @LOAD_DATASET.register_module()
 class BBEHDataset(BaseDataset):
+
     @staticmethod
     def load(path: str, name: str):
         path = get_data_path(path)
@@ -32,9 +33,7 @@ class BBEHDataset(BaseDataset):
 def bbeh_freeform_postprocess(text: str) -> str:
     # Extract answer using specified prefixes
     prefixes = [
-        'The answer is: ',
-        'The answer is ',
-        'The final answer is: ',
+        'The answer is: ', 'The answer is ', 'The final answer is: ',
         'The final answer is '
     ]
     answer = text
@@ -62,9 +61,7 @@ def bbeh_freeform_postprocess(text: str) -> str:
 def bbeh_mcq_postprocess(text: str) -> str:
     # Extract answer using specified prefixes
     prefixes = [
-        'The answer is: ',
-        'The answer is ',
-        'The final answer is: ',
+        'The answer is: ', 'The answer is ', 'The final answer is: ',
         'The final answer is '
     ]
     answer = text
@@ -85,12 +82,16 @@ def bbeh_mcq_postprocess(text: str) -> str:
 
 @ICL_EVALUATORS.register_module()
 class BBEHEvaluator(BaseEvaluator):
+
     def score(self, predictions, references):
         if len(predictions) != len(references):
-            return {'error': 'predictions and references have different length'}
+            return {
+                'error': 'predictions and references have different length'
+            }
 
         processed_preds = [bbeh_freeform_postprocess(p) for p in predictions]
-        processed_refs = [r.lower() for r in references]  # References are already in correct format
+        # References are already in correct format
+        processed_refs = [r.lower() for r in references]
 
         details = []
         correct_count = 0
@@ -111,11 +112,7 @@ class BBEHEvaluator(BaseEvaluator):
                 if norm_pred == norm_ref:
                     correct = True
 
-            details.append({
-                'pred': pred,
-                'answer': ref,
-                'correct': correct
-            })
+            details.append({'pred': pred, 'answer': ref, 'correct': correct})
             correct_count += int(correct)
 
         score = (correct_count / len(predictions)) * 100
@@ -124,12 +121,16 @@ class BBEHEvaluator(BaseEvaluator):
 
 @ICL_EVALUATORS.register_module()
 class BBEHEvaluator_mcq(BaseEvaluator):
+
     def score(self, predictions, references):
         if len(predictions) != len(references):
-            return {'error': 'predictions and references have different length'}
+            return {
+                'error': 'predictions and references have different length'
+            }
 
         processed_preds = [bbeh_mcq_postprocess(p) for p in predictions]
-        processed_refs = [r.lower().strip('()') for r in references]  # References are already in correct format
+        # References are already in correct format
+        processed_refs = [r.lower().strip('()') for r in references]
 
         details = []
         correct_count = 0
@@ -141,12 +142,8 @@ class BBEHEvaluator_mcq(BaseEvaluator):
             if pred == ref:
                 correct = True
 
-            details.append({
-                'pred': pred,
-                'answer': ref,
-                'correct': correct
-            })
+            details.append({'pred': pred, 'answer': ref, 'correct': correct})
             correct_count += int(correct)
 
         score = (correct_count / len(predictions)) * 100
-        return {'score': score, 'details': details}
\ No newline at end of file
+        return {'score': score, 'details': details}
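
As a quick sanity check for reviewers, here is a minimal, self-contained sketch of the answer extraction that `bbeh_freeform_postprocess` in `opencompass/datasets/bbeh.py` performs. The helper name `extract_freeform_answer` and the sample model output are illustrative only, and the normalization steps after the prefix split (keeping the first line, trimming a trailing period, lower-casing) are assumptions based on the hunks above rather than the full function body.

```python
# Illustrative sketch only: mirrors the prefix handling shown in this diff for
# bbeh_freeform_postprocess; the trailing normalization steps are assumptions.
def extract_freeform_answer(text: str) -> str:
    prefixes = [
        'The answer is: ', 'The answer is ', 'The final answer is: ',
        'The final answer is '
    ]
    answer = text
    for prefix in prefixes:
        if prefix in text:
            # Keep whatever follows the last occurrence of the prefix
            answer = text.split(prefix)[-1]
            break
    answer = answer.split('\n')[0]  # assumed: keep only the first line
    return answer.strip().rstrip('.').lower()  # assumed: trim and lower-case


if __name__ == '__main__':
    sample = 'Reasoning...\nThe final answer is: Walking.\nDone.'  # hypothetical output
    assert extract_freeform_answer(sample) == 'walking'
```

The MCQ variant in this diff differs only in its reference handling: `BBEHEvaluator_mcq` additionally strips surrounding parentheses from references (`r.lower().strip('()')`) before comparison.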