Merge branch 'open-compass:main' into main

bittersweet1999 2025-04-09 16:26:07 +08:00 committed by GitHub
commit 975e4bcadf
260 changed files with 14112 additions and 2840 deletions

View File

@ -25,8 +25,8 @@ models = [
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://localhost:23333/v1',
path='internlm2',
tokenizer_path='internlm/internlm2_5-7b-chat',
path='internlm3',
tokenizer_path='internlm/internlm3-8b-instruct',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=128,
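
The hunk above only swaps the served model name and tokenizer. For orientation, a hedged sketch of the full API-model entry it belongs to is shown below; only the fields visible in the diff are taken from the config, while `abbr`, the meta-template contents, and the generation settings are illustrative assumptions.

```python
# Hedged reconstruction of the updated OpenAISDK entry; only path,
# tokenizer_path, key, openai_api_base, rpm_verbose, meta_template and
# query_per_second appear in the diff -- the remaining fields are assumptions.
from opencompass.models import OpenAISDK

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

models = [
    dict(
        abbr='lmdeploy-api-test',  # assumed; mirrors the baseline key used later
        type=OpenAISDK,
        key='EMPTY',
        openai_api_base='http://localhost:23333/v1',
        path='internlm3',  # must match --model-name of the served backend
        tokenizer_path='internlm/internlm3-8b-instruct',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=128,
        max_out_len=1024,   # assumed
        batch_size=128,     # assumed
        temperature=0.01,   # assumed
    ),
]
```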

View File

@ -11,18 +11,10 @@ with read_base():
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
models as lmdeploy_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
models as hf_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
models as hf_deepseek_67b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
@ -49,12 +41,6 @@ with read_base():
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
models as hf_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
@ -65,14 +51,14 @@ with read_base():
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
models as lmdeploy_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
models as hf_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_70b import \
models as hf_llama3_70b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \

View File

@ -15,14 +15,24 @@ with read_base():
models as vllm_glm4_9b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
models as hf_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
models as lmdeploy_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
models as \
lmdeploy_deepseek_r1_distill_llama_8b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
models as \
lmdeploy_deepseek_r1_distill_llama_70b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_1_5b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_32b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
models as lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
models as lmdeploy_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
@ -45,6 +55,8 @@ with read_base():
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
models as hf_internlm2_5_20b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
models as hf_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
@ -57,6 +69,8 @@ with read_base():
models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
models as lmdeploy_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
@ -83,10 +97,6 @@ with read_base():
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
models as \
lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501
@ -95,14 +105,19 @@ with read_base():
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
models as \
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
models as \
lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_4 import \
models as hf_phi_4_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
@ -142,6 +157,8 @@ with read_base():
from ...volc import infer as volc_infer # noqa: F401, E501
hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
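
The closing `datasets = sum(...)` line works because every config pulled in under `with read_base():` binds a list whose name ends in `_datasets` into the module namespace. A self-contained illustration of that aggregation idiom follows; the two placeholder lists are made up and stand in for imported configs.

```python
# Stand-alone sketch of the *_datasets aggregation idiom used in the config
# above; the two lists are placeholders, not real OpenCompass configs.
gsm8k_datasets = [dict(abbr='demo_gsm8k')]
race_datasets = [dict(abbr='race-middle'), dict(abbr='race-high')]

# Collect every module-level variable whose name ends with '_datasets'.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

assert sorted(d['abbr'] for d in datasets) == [
    'demo_gsm8k', 'race-high', 'race-middle']
```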

View File

@ -175,10 +175,11 @@ class TestApibench:
class TestVolcFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
] for p2 in dataset_list(p1, 'objective')])
@pytest.mark.chat_objective
def test_chat_objective(self, baseline_scores_fullbench, result_scores,
model, dataset):
@ -245,10 +246,7 @@ class TestCmdCase:
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle_accuracy'),
('internlm2_5-7b-hf', 'race-high_accuracy'),
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
('internlm2-1.8b-hf', 'race-middle_accuracy'),
('internlm2-1.8b-hf', 'race-high_accuracy'),
('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@ -260,9 +258,9 @@ class TestCmdCase:
[('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@ -280,13 +278,25 @@ class TestCmdCase:
@pytest.mark.case4
@pytest.mark.parametrize(
'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
'model, dataset',
[('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score, dataset)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.case5
@pytest.mark.parametrize(
'model, dataset',
[('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score, dataset)
def assert_score(model_type, score, baseline, dataset: str = ''):
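
The body of `assert_score` is cut off at the hunk boundary. For readability, a hedged sketch of a comparison in the same spirit is shown below; the ±5-point band and the error messages are assumptions, not the repository's actual tolerance logic.

```python
# Hypothetical stand-in for the truncated assert_score helper: pass the case
# when the measured score stays within an assumed +/-5-point band around the
# recorded baseline.
def assert_score(model_type, score, baseline, dataset: str = ''):
    assert score is not None and baseline is not None, (
        f'{model_type}/{dataset}: missing result or baseline score')
    score, baseline = float(score), float(baseline)
    tolerance = 5.0  # assumed band; the real helper may be stricter or model-dependent
    assert baseline - tolerance <= score <= baseline + tolerance, (
        f'{model_type}/{dataset}: {score} outside '
        f'[{baseline - tolerance}, {baseline + tolerance}]')
```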

View File

@ -8,20 +8,25 @@ internlm2_5-7b_hf:
race-middle_accuracy: 91.78
race-high_accuracy: 90.02
internlm2-1.8b-hf:
demo_gsm8k_accuracy: 15.62
race-middle_accuracy: 71.66
race-high_accuracy: 66.38
internlm2_5-7b-chat-lmdeploy:
demo_gsm8k_accuracy: 89.06
demo_gsm8k_accuracy: 87.50
race-middle_accuracy: 92.76
race-high_accuracy: 90.54
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k_accuracy: 31
race-middle_accuracy: 81.34
race-high_accuracy: 73.96
internlm3-8b-instruct-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34
internlm3-8b-instruct_hf-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34
internlm3-8b-instruct_hf-vllm:
demo_gsm8k_accuracy: 81.25
race-middle_accuracy: 92.20
race-high_accuracy: 89.88
internlm2_5-7b-chat_hf:
demo_gsm8k_accuracy: 87.50
@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf:
race-high_accuracy: 90.48
lmdeploy-api-test:
gsm8k_accuracy: 68.75
race-middle_accuracy: 87.50
gsm8k_accuracy: 56.25
race-middle_accuracy: 93.75
race-high_accuracy: 93.75
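
Blocks like the one above are what the `baseline_scores` fixture hands to the cmd-case tests as nested dicts. A minimal sketch of such a fixture follows, assuming the baseline sits next to `oc_score_assert.py` under `.github/scripts/`; the exact file name is an assumption.

```python
# Hedged sketch of a baseline-loading fixture; the YAML path is an assumption
# about the repository layout, not taken from the diff.
import pytest
import yaml


@pytest.fixture(scope='session')
def baseline_scores():
    with open('.github/scripts/oc_score_baseline.yaml', encoding='utf-8') as f:
        return yaml.safe_load(f)
```

With data shaped like the block above, `baseline_scores.get('internlm3-8b-instruct-lmdeploy').get('demo_gsm8k_accuracy')` would return 73.44, which is exactly how the tests read it.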

View File

@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench:
drop_accuracy: 81.25
GPQA_diamond_accuracy: 25
hellaswag_accuracy: 87.5
TheoremQA_score: 18.75
TheoremQA_score: 12.50
musr_average_naive_average: 39.58
korbench_single_naive_average: 40
gsm8k_accuracy: 62.50
@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench:
lcb_test_output_pass@1: 18.75
bbh-logical_deduction_seven_objects_score: 50
bbh-multistep_arithmetic_two_score: 68.75
mmlu-other_naive_average: 72.6
cmmlu-china-specific_naive_average: 76.25
mmlu-other_accuracy: 72.6
cmmlu-china-specific_accuracy: 76.25
mmlu_pro_math_accuracy: 25
ds1000_Pandas_accuracy: 12.5
ds1000_Numpy_accuracy: 0
@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
college_knowledge_naive_average: 87.5
subjective:
alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 20
alpaca_eval_total: 0
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 44.00
CompassArena_naive_average: 43
mtbench101_avg: 7.8
wildbench_average: -12.78
wildbench_average: -15.56
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 7.90
alignment_bench_v1_1_专业能力: 8.00
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
@ -55,10 +55,10 @@ internlm2_5-7b-chat-hf_fullbench:
alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 20
alpaca_eval_helpful_base: 0
compassarena_language_naive_average: 35
compassarena_knowledge_naive_average: 55
compassarena_reason_v2_naive_average: 45.00
compassarena_reason_v2_naive_average: 40
compassarena_math_v2_naive_average: 55
compassarena_creationv2_zh_naive_average: 30
followbench_llmeval_en_HSR_AVG: 1
@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench:
internlm2_5-7b-chat-turbomind_fullbench:
objective:
race-high_accuracy: 93.75
ARC-c_accuracy: 93.75
ARC-c_accuracy: 87.50
BoolQ_accuracy: 68.75
triviaqa_wiki_1shot_score: 50
nq_open_1shot_score: 25
IFEval_Prompt-level-strict-accuracy: 56.25
drop_accuracy: 81.25
drop_accuracy: 75
GPQA_diamond_accuracy: 31.25
hellaswag_accuracy: 81.25
TheoremQA_score: 6.25
hellaswag_accuracy: 87.5
TheoremQA_score: 12.5
musr_average_naive_average: 39.58
korbench_single_naive_average: 37.50
gsm8k_accuracy: 68.75
math_accuracy: 68.75
korbench_single_naive_average: 40
gsm8k_accuracy: 62.5
math_accuracy: 75
cmo_fib_accuracy: 6.25
aime2024_accuracy: 6.25
wikibench-wiki-single_choice_cncircular_perf_4: 50.00
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 68.75
ds1000_naive_average: 16.96
ds1000_naive_average: 17.86
lcb_code_generation_pass@1: 12.5
lcb_code_execution_pass@1: 43.75
lcb_test_output_pass@1: 25.00
bbh-logical_deduction_seven_objects_score: 50.00
bbh-multistep_arithmetic_two_score: 68.75
mmlu-other_naive_average: 69.71
cmmlu-china-specific_naive_average: 75.83
lcb_test_output_pass@1: 18.75
bbh-logical_deduction_seven_objects_score: 56.25
bbh-multistep_arithmetic_two_score: 75
mmlu-other_accuracy: 72.6
cmmlu-china-specific_accuracy: 78.33
mmlu_pro_math_accuracy: 31.25
ds1000_Pandas_accuracy: 0
ds1000_Pandas_accuracy: 12.5
ds1000_Numpy_accuracy: 0
ds1000_Tensorflow_accuracy: 12.5
ds1000_Scipy_accuracy: 18.75
ds1000_Scipy_accuracy: 25
ds1000_Sklearn_accuracy: 18.75
ds1000_Pytorch_accuracy: 18.75
ds1000_Pytorch_accuracy: 6.25
ds1000_Matplotlib_accuracy: 50.00
openai_mmmlu_lite_AR-XY_accuracy: 37.5
college_naive_average: 12.50
college_knowledge_naive_average: 87.5
subjective:
alignment_bench_v1_1_总分: 0.70
alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 0
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 38
mtbench101_avg: 7.80
wildbench_average: -4.86
CompassArena_naive_average: 40
mtbench101_avg: 8
wildbench_average: -6.81
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 8.4
alignment_bench_v1_1_专业能力: 7.9
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench:
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 0
compassarena_language_naive_average: 35
compassarena_knowledge_naive_average: 50
compassarena_reason_v2_naive_average: 30
compassarena_math_v2_naive_average: 50
compassarena_creationv2_zh_naive_average: 25
compassarena_knowledge_naive_average: 45
compassarena_reason_v2_naive_average: 25
compassarena_math_v2_naive_average: 60
compassarena_creationv2_zh_naive_average: 35
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 25
TheoremQA_score: 12.50
winogrande_accuracy: 75
gsm8k_accuracy: 37.5
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench:
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 25.00
TheoremQA_score: 12.50
winogrande_accuracy: 87.5
gsm8k_accuracy: 62.50
GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
gsm8k_accuracy: 56.25
GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
math_accuracy: 18.75
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 62.50
dingo_en_192_score: 31.25
dingo_en_192_score: 50.00
dingo_zh_170_score: 93.75
mmlu-other_accuracy: 76.92
cmmlu-china-specific_accuracy: 84.17
mmlu_pro_math_accuracy: 18.75
bbh-logical_deduction_seven_objects_score: 50
bbh-logical_deduction_seven_objects_score: 43.75
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
@ -234,15 +234,15 @@ internlm2_5-7b-turbomind:
sanitized_mbpp_score: 55.25
dingo_en_192_score: 60.94
dingo_zh_170_score: 67.65
mmlu-stem_naive_average: 63.72
mmlu-social-science_naive_average: 80.15
mmlu-humanities_naive_average: 74.27
mmlu-other_naive_average: 71.85
cmmlu-stem_naive_average: 67.07
cmmlu-social-science_naive_average: 81.49
cmmlu-humanities_naive_average: 85.84
cmmlu-other_naive_average: 82.69
cmmlu-china-specific_naive_average: 79.88
mmlu-stem_accuracy: 63.72
mmlu-social-science_accuracy: 80.15
mmlu-humanities_accuracy: 74.27
mmlu-other_accuracy: 71.85
cmmlu-stem_accuracy: 67.07
cmmlu-social-science_accuracy: 81.49
cmmlu-humanities_accuracy: 85.84
cmmlu-other_accuracy: 82.69
cmmlu-china-specific_accuracy: 79.88
mmlu_pro_biology_accuracy: 58.58
mmlu_pro_business_accuracy: 28.01
mmlu_pro_chemistry_accuracy: 22.79
@ -281,12 +281,12 @@ internlm2_5-7b-turbomind:
longbench_naive_average: 46.19
longbench_zh_naive_average: 49.3
longbench_en_naive_average: 43.97
longbench_single-document-qa_naive_average: 42.84
longbench_multi-document-qa_naive_average: 37.29
longbench_summarization_naive_average: 23.21
longbench_few-shot-learning_naive_average: 61.67
longbench_synthetic-tasks_naive_average: 60.05
longbench_code-completion_naive_average: 52.09
longbench_single-document-qa_score: 42.84
longbench_multi-document-qa_score: 41.25
longbench_summarization_score: 23.21
longbench_few-shot-learning_score: 61.67
longbench_synthetic-tasks_score: 60.05
longbench_code-completion_score: 52.09
internlm2_5-7b-chat-turbomind:
objective:
@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind:
teval_naive_average: 80
SciCode_sub_accuracy: 5.56
qa_dingo_cn_score: 99.01
mmlu-stem_naive_average: 68.2
mmlu-social-science_naive_average: 75.8
mmlu-humanities_naive_average: 69.3
mmlu-other_naive_average: 71.3
cmmlu-stem_naive_average: 66.64
cmmlu-social-science_naive_average: 76
cmmlu-humanities_naive_average: 77.9
cmmlu-other_naive_average: 77.25
cmmlu-china-specific_naive_average: 73.6
mmlu-stem_accuracy: 68.2
mmlu-social-science_accuracy: 75.8
mmlu-humanities_accuracy: 69.3
mmlu-other_accuracy: 71.3
cmmlu-stem_accuracy: 66.64
cmmlu-social-science_accuracy: 76
cmmlu-humanities_accuracy: 77.9
cmmlu-other_accuracy: 77.25
cmmlu-china-specific_accuracy: 73.6
mmlu_pro_biology_accuracy: 66.67
mmlu_pro_business_accuracy: 47.91
mmlu_pro_chemistry_accuracy: 35
@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind:
openai_mmmlu_lite_DE-DE_accuracy: 51.27
openai_mmmlu_lite_ES-LA_accuracy: 56.94
openai_mmmlu_lite_FR-FR_accuracy: 58.22
openai_mmmlu_lite_HI-IN_accuracy: 33.75
openai_mmmlu_lite_HI-IN_accuracy: 30.75
openai_mmmlu_lite_ID-ID_accuracy: 50.6
openai_mmmlu_lite_IT-IT_accuracy: 50.6
openai_mmmlu_lite_JA-JP_accuracy: 51.13
@ -391,10 +391,10 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_total: 25.96
arenahard_score: 17.15
Followbench_naive_average: 0.81
CompassArena_naive_average: 34.61
CompassArena_naive_average: 39.49
FoFo_naive_average: 0.38
mtbench101_avg: 8.01
wildbench_average: -15.69
wildbench_average: -10.49
simpleqa_accuracy_given_attempted: 0.04
chinese_simpleqa_given_attempted_accuracy: 0.34
alignment_bench_v1_1_专业能力: 6.05
@ -409,12 +409,12 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_koala: 28.21
alpaca_eval_oasst: 23.4
alpaca_eval_selfinstruct: 30.95
alpaca_eval_vicuna: 25
compassarena_language_naive_average: 52.5
alpaca_eval_vicuna: 33.75
compassarena_language_naive_average: 58.50
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_creationv2_zh_naive_average: 35.81
compassarena_math_v2_naive_average: 25.95
compassarena_creationv2_zh_naive_average: 43.64
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
followbench_llmeval_en_HSR_AVG: 0.73
@ -448,9 +448,536 @@ internlm2_5-7b-chat-1m-turbomind:
babilong_32k_naive_average: 48.9
babilong_128k_naive_average: 40.8
babilong_256k_naive_average: 23.5
longbench_single-document-qa_naive_average: 43.56
longbench_multi-document-qa_naive_average: 46.24
longbench_summarization_naive_average: 24.32
longbench_few-shot-learning_naive_average: 51.67
longbench_synthetic-tasks_naive_average: 66.83
longbench_code-completion_naive_average: 45.99
longbench_single-document-qa_score: 43.56
longbench_multi-document-qa_score: 46.24
longbench_summarization_score: 24.32
longbench_few-shot-learning_score: 51.67
longbench_synthetic-tasks_score: 66.83
longbench_code-completion_score: 45.99
qwen2.5-7b-instruct-turbomind:
objective:
race-high_accuracy: 84.99
ARC-c_accuracy: 92.2
BoolQ_accuracy: 86.7
triviaqa_wiki_1shot_score: 53.06
nq_open_1shot_score: 17.51
mmmlu_lite_naive_average: 54.96
IFEval_Prompt-level-strict-accuracy: 71.53
drop_accuracy: 80.07
bbh_naive_average: 68.81
GPQA_diamond_accuracy: 34.34
hellaswag_accuracy: 85.42
TheoremQA_score: 18.38
musr_average_naive_average: 43.44
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 92.57
GaokaoBench_weighted_average: 80.14
math_accuracy: 73.58
cmo_fib_accuracy: 25
aime2024_accuracy: 16.67
Mathbench_naive_average: 77.33
wikibench-wiki-single_choice_cncircular_perf_4: 34.9
cmmlu_naive_average: 75.97
mmlu_naive_average: 76.01
mmlu_pro_naive_average: 56.12
openai_humaneval_humaneval_pass@1: 83.54
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.29
ds1000_naive_average: 18.66
lcb_code_generation_pass@1: 39.5
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.68
bigcodebench_hard_instruct_pass@1: 16.22
bigcodebench_hard_complete_pass@1: 11.49
teval_naive_average: 79.72
SciCode_sub_accuracy: 10.76
qa_dingo_cn_score: 99.01
mmlu_accuracy: 76.01
mmlu-stem_accuracy: 77.59
mmlu-social-science_accuracy: 79.02
mmlu-humanities_accuracy: 72.07
mmlu-other_accuracy: 74.86
cmmlu_accuracy: 75.97
cmmlu-stem_accuracy: 73.09
cmmlu-social-science_accuracy: 75.95
cmmlu-humanities_accuracy: 76.53
cmmlu-other_accuracy: 78.79
cmmlu-china-specific_accuracy: 73.17
mmlu_pro_accuracy: 56.12
mmlu_pro_biology_accuracy: 71.41
mmlu_pro_business_accuracy: 67.68
mmlu_pro_chemistry_accuracy: 54.59
mmlu_pro_computer_science_accuracy: 58.29
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 42.41
mmlu_pro_health_accuracy: 55.87
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 28.97
mmlu_pro_math_accuracy: 73.13
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 58.43
mmlu_pro_psychology_accuracy: 63.16
mmlu_pro_other_accuracy: 53.57
humanevalx-python_pass@1: 50
humanevalx-cpp_pass@1: 42.07
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 75
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.18
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 10.43
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 60.65
mmmlu_lite_accuracy: 54.96
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.25
openai_mmmlu_lite_DE-DE_accuracy: 59.93
openai_mmmlu_lite_ES-LA_accuracy: 66.53
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 49.26
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.47
openai_mmmlu_lite_JA-JP_accuracy: 61.54
openai_mmmlu_lite_KO-KR_accuracy: 60.28
openai_mmmlu_lite_PT-BR_accuracy: 55.51
openai_mmmlu_lite_SW-KE_accuracy: 36.42
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 69.61
college_naive_average: 48
high_naive_average: 59
middle_naive_average: 78
primary_naive_average: 85.67
arithmetic_naive_average: 75.67
mathbench-a (average)_naive_average: 69.27
college_knowledge_naive_average: 83.86
high_knowledge_naive_average: 80.29
middle_knowledge_naive_average: 84.26
primary_knowledge_naive_average: 93.16
mathbench-t (average)_naive_average: 85.39
internlm2_5-7b-chat-pytorch:
objective:
race-high_accuracy: 86.39
ARC-c_accuracy: 90.51
BoolQ_accuracy: 88.01
triviaqa_wiki_1shot_score: 64.77
nq_open_1shot_score: 22.71
mmmlu_lite_naive_average: 45.02
IFEval_Prompt-level-strict-accuracy: 56.56
drop_accuracy: 75.46
bbh_naive_average: 73.34
GPQA_diamond_accuracy: 32.83
hellaswag_accuracy: 94.81
TheoremQA_score: 23.88
musr_average_naive_average: 51.31
korbench_single_naive_average: 32
ARC_Prize_Public_Evaluation_accuracy: 0.01
gsm8k_accuracy: 86.96
GaokaoBench_weighted_average: 78.05
math_accuracy: 60.34
cmo_fib_accuracy: 12.98
aime2024_accuracy: 3.33
Mathbench_naive_average: 64.82
wikibench-wiki-single_choice_cncircular_perf_4: 31.7
cmmlu_naive_average: 74.24
mmlu_naive_average: 70.2
mmlu_pro_naive_average: 45.39
openai_humaneval_humaneval_pass@1: 70.12
sanitized_mbpp_score: 64.59
humanevalx_naive_average: 38.78
ds1000_naive_average: 14.19
lcb_code_generation_pass@1: 16.5
lcb_code_execution_pass@1: 33.82
lcb_test_output_pass@1: 22.62
bigcodebench_hard_instruct_pass@1: 6.08
bigcodebench_hard_complete_pass@1: 6.76
teval_naive_average: 79.73
SciCode_sub_accuracy: 3.47
qa_dingo_cn_score: 100
mmlu_accuracy: 70.2
mmlu-stem_accuracy: 67.73
mmlu-social-science_accuracy: 75.49
mmlu-humanities_accuracy: 68.56
mmlu-other_accuracy: 70.58
cmmlu_accuracy: 74.24
cmmlu-stem_accuracy: 66.7
cmmlu-social-science_accuracy: 75.88
cmmlu-humanities_accuracy: 77.56
cmmlu-other_accuracy: 77.52
cmmlu-china-specific_accuracy: 73.46
mmlu_pro_accuracy: 45.39
mmlu_pro_biology_accuracy: 65.83
mmlu_pro_business_accuracy: 51.96
mmlu_pro_chemistry_accuracy: 36.84
mmlu_pro_computer_science_accuracy: 48.29
mmlu_pro_economics_accuracy: 56.16
mmlu_pro_engineering_accuracy: 29.1
mmlu_pro_health_accuracy: 44.5
mmlu_pro_history_accuracy: 42.26
mmlu_pro_law_accuracy: 24.98
mmlu_pro_math_accuracy: 54.85
mmlu_pro_philosophy_accuracy: 39.28
mmlu_pro_physics_accuracy: 37.41
mmlu_pro_psychology_accuracy: 58.27
mmlu_pro_other_accuracy: 45.78
humanevalx-python_pass@1: 56.1
humanevalx-cpp_pass@1: 20.73
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 59.15
humanevalx-js_pass@1: 57.93
ds1000_Pandas_accuracy: 8.93
ds1000_Numpy_accuracy: 4.09
ds1000_Tensorflow_accuracy: 11.11
ds1000_Scipy_accuracy: 7.55
ds1000_Sklearn_accuracy: 7.83
ds1000_Pytorch_accuracy: 8.82
ds1000_Matplotlib_accuracy: 50.97
mmmlu_lite_accuracy: 45.02
openai_mmmlu_lite_AR-XY_accuracy: 18.6
openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_DE-DE_accuracy: 51.23
openai_mmmlu_lite_ES-LA_accuracy: 56.63
openai_mmmlu_lite_FR-FR_accuracy: 58.11
openai_mmmlu_lite_HI-IN_accuracy: 33.82
openai_mmmlu_lite_ID-ID_accuracy: 50.39
openai_mmmlu_lite_IT-IT_accuracy: 50.39
openai_mmmlu_lite_JA-JP_accuracy: 50.95
openai_mmmlu_lite_KO-KR_accuracy: 45.05
openai_mmmlu_lite_PT-BR_accuracy: 57.89
openai_mmmlu_lite_SW-KE_accuracy: 32.14
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 65.33
college_naive_average: 21
high_naive_average: 47
middle_naive_average: 59.67
primary_naive_average: 76
arithmetic_naive_average: 62
mathbench-a (average)_naive_average: 53.13
college_knowledge_naive_average: 68.99
high_knowledge_naive_average: 70.06
middle_knowledge_naive_average: 78.53
primary_knowledge_naive_average: 88.49
mathbench-t (average)_naive_average: 76.51
qwen2.5-7b-instruct-pytorch:
objective:
race-high_accuracy: 85.16
ARC-c_accuracy: 90.85
BoolQ_accuracy: 86.61
triviaqa_wiki_1shot_score: 52.96
nq_open_1shot_score: 17.62
mmmlu_lite_naive_average: 54.7
IFEval_Prompt-level-strict-accuracy: 71.35
drop_accuracy: 80.23
bbh_naive_average: 68.88
GPQA_diamond_accuracy: 36.36
hellaswag_accuracy: 85.49
TheoremQA_score: 18.38
musr_average_naive_average: 43.3
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 91.66
GaokaoBench_weighted_average: 80.02
math_accuracy: 73.74
cmo_fib_accuracy: 26.44
aime2024_accuracy: 13.33
Mathbench_naive_average: 77.08
wikibench-wiki-single_choice_cncircular_perf_4: 34
cmmlu_naive_average: 75.9
mmlu_naive_average: 76.27
mmlu_pro_naive_average: 56.14
openai_humaneval_humaneval_pass@1: 84.76
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.17
ds1000_naive_average: 18.57
lcb_code_generation_pass@1: 38.75
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.45
bigcodebench_hard_instruct_pass@1: 16.89
bigcodebench_hard_complete_pass@1: 12.16
teval_naive_average: 79.46
SciCode_sub_accuracy: 10.42
qa_dingo_cn_score: 100
mmlu_accuracy: 76.27
mmlu-stem_accuracy: 77.75
mmlu-social-science_accuracy: 78.65
mmlu-humanities_accuracy: 73.12
mmlu-other_accuracy: 75.05
cmmlu_accuracy: 75.9
cmmlu-stem_accuracy: 73.41
cmmlu-social-science_accuracy: 75.97
cmmlu-humanities_accuracy: 76.42
cmmlu-other_accuracy: 78.15
cmmlu-china-specific_accuracy: 73.27
mmlu_pro_accuracy: 56.14
mmlu_pro_biology_accuracy: 72.25
mmlu_pro_business_accuracy: 66.16
mmlu_pro_chemistry_accuracy: 55.65
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 41.38
mmlu_pro_health_accuracy: 54.89
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 29.06
mmlu_pro_math_accuracy: 73.58
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 60.05
mmlu_pro_psychology_accuracy: 61.9
mmlu_pro_other_accuracy: 52.6
humanevalx-python_pass@1: 51.83
humanevalx-cpp_pass@1: 42.68
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 73.78
humanevalx-js_pass@1: 72.56
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.64
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 8.7
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 61.29
mmmlu_lite_accuracy: 54.7
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.18
openai_mmmlu_lite_DE-DE_accuracy: 60
openai_mmmlu_lite_ES-LA_accuracy: 66.18
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 48.63
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.26
openai_mmmlu_lite_JA-JP_accuracy: 60.7
openai_mmmlu_lite_KO-KR_accuracy: 60.63
openai_mmmlu_lite_PT-BR_accuracy: 54.46
openai_mmmlu_lite_SW-KE_accuracy: 36
openai_mmmlu_lite_YO-NG_accuracy: 31.86
openai_mmmlu_lite_ZH-CN_accuracy: 69.4
college_naive_average: 48.33
high_naive_average: 59.33
middle_naive_average: 76.67
primary_naive_average: 86.67
arithmetic_naive_average: 74.33
mathbench-a (average)_naive_average: 69.07
college_knowledge_naive_average: 83.54
high_knowledge_naive_average: 80.82
middle_knowledge_naive_average: 83.79
primary_knowledge_naive_average: 92.22
mathbench-t (average)_naive_average: 85.1
internlm3-8b-instruct-turbomind:
objective:
race-high_accuracy: 89.22
ARC-c_accuracy: 92.54
BoolQ_accuracy: 86.45
triviaqa_wiki_1shot_score: 60.72
nq_open_1shot_score: 20.25
mmmlu_lite_naive_average: 41.82
IFEval_Prompt-level-strict-accuracy: 77.45
drop_accuracy: 83.27
bbh_naive_average: 55.22
GPQA_diamond_accuracy: 37.88
hellaswag_accuracy: 91.28
TheoremQA_score: 20.12
musr_average_naive_average: 36.86
korbench_single_naive_average: 41.2
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 91.28
GaokaoBench_weighted_average: 86.59
math_accuracy: 76.96
cmo_fib_accuracy: 35.1
aime2024_accuracy: 16.67
Mathbench_naive_average: 78.96
wikibench-wiki-single_choice_cncircular_perf_4: 37.45
cmmlu_naive_average: 83.33
mmlu_naive_average: 76.21
mmlu_pro_naive_average: 57.96
openai_humaneval_humaneval_pass@1: 81.71
sanitized_mbpp_score: 69.65
humanevalx_naive_average: 40.73
ds1000_naive_average: 27.23
lcb_code_generation_pass@1: 34.75
lcb_code_execution_pass@1: 49.9
lcb_test_output_pass@1: 48.19
bigcodebench_hard_instruct_pass@1: 13.51
bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86
SciCode_sub_accuracy: 11.11
qa_dingo_cn_score: 100
mmlu_accuracy: 76.21
mmlu-stem_accuracy: 77.7
mmlu-social-science_accuracy: 80.98
mmlu-humanities_accuracy: 70.83
mmlu-other_accuracy: 75.01
cmmlu_accuracy: 83.33
cmmlu-stem_accuracy: 79.66
cmmlu-social-science_accuracy: 83.39
cmmlu-humanities_accuracy: 84.73
cmmlu-other_accuracy: 86.2
cmmlu-china-specific_accuracy: 81.77
mmlu_pro_accuracy: 57.96
mmlu_pro_biology_accuracy: 75.45
mmlu_pro_business_accuracy: 64.64
mmlu_pro_chemistry_accuracy: 59.81
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 68.6
mmlu_pro_engineering_accuracy: 44.79
mmlu_pro_health_accuracy: 58.31
mmlu_pro_history_accuracy: 49.87
mmlu_pro_law_accuracy: 32.43
mmlu_pro_math_accuracy: 70.17
mmlu_pro_philosophy_accuracy: 46.89
mmlu_pro_physics_accuracy: 59.58
mmlu_pro_psychology_accuracy: 66.29
mmlu_pro_other_accuracy: 54.33
humanevalx-python_pass@1: 43.9
humanevalx-cpp_pass@1: 20.12
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 65.24
ds1000_Pandas_accuracy: 16.49
ds1000_Numpy_accuracy: 34.09
ds1000_Tensorflow_accuracy: 26.67
ds1000_Scipy_accuracy: 17.92
ds1000_Sklearn_accuracy: 20.87
ds1000_Pytorch_accuracy: 19.12
ds1000_Matplotlib_accuracy: 55.48
mmmlu_lite_accuracy: 41.82
openai_mmmlu_lite_AR-XY_accuracy: 32.56
openai_mmmlu_lite_BN-BD_accuracy: 4.56
openai_mmmlu_lite_DE-DE_accuracy: 24.91
openai_mmmlu_lite_ES-LA_accuracy: 51.09
openai_mmmlu_lite_FR-FR_accuracy: 61.68
openai_mmmlu_lite_HI-IN_accuracy: 24.98
openai_mmmlu_lite_ID-ID_accuracy: 44.56
openai_mmmlu_lite_IT-IT_accuracy: 52.35
openai_mmmlu_lite_JA-JP_accuracy: 51.02
openai_mmmlu_lite_KO-KR_accuracy: 47.93
openai_mmmlu_lite_PT-BR_accuracy: 53.89
openai_mmmlu_lite_SW-KE_accuracy: 33.47
openai_mmmlu_lite_YO-NG_accuracy: 33.47
openai_mmmlu_lite_ZH-CN_accuracy: 69.05
college_naive_average: 45.67
high_naive_average: 64.67
middle_naive_average: 82.33
primary_naive_average: 90.33
arithmetic_naive_average: 74
mathbench-a (average)_naive_average: 71.4
college_knowledge_naive_average: 85.28
high_knowledge_naive_average: 79.43
middle_knowledge_naive_average: 87.9
primary_knowledge_naive_average: 93.42
mathbench-t (average)_naive_average: 86.51
internlm3-8b-instruct-pytorch:
objective:
race-high_accuracy: 89.02
ARC-c_accuracy: 93.56
BoolQ_accuracy: 86.67
triviaqa_wiki_1shot_score: 60.54
nq_open_1shot_score: 20.3
mmmlu_lite_naive_average: 42.6
IFEval_Prompt-level-strict-accuracy: 79.11
drop_accuracy: 83.32
bbh_naive_average: 54.76
GPQA_diamond_accuracy: 33.84
hellaswag_accuracy: 91.31
TheoremQA_score: 18
musr_average_naive_average: 36.62
korbench_single_naive_average: 41.84
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 90.67
GaokaoBench_weighted_average: 86.27
math_accuracy: 76.68
cmo_fib_accuracy: 33.65
aime2024_accuracy: 10
Mathbench_naive_average: 78.92
wikibench-wiki-single_choice_cncircular_perf_4: 37.35
cmmlu_naive_average: 83.11
mmlu_naive_average: 76.23
mmlu_pro_naive_average: 58.16
openai_humaneval_humaneval_pass@1: 82.32
sanitized_mbpp_score: 70.04
humanevalx_naive_average: 39.76
ds1000_naive_average: 27.84
lcb_code_generation_pass@1: 34.5
lcb_code_execution_pass@1: 48.02
lcb_test_output_pass@1: 47.74
bigcodebench_hard_instruct_pass@1: 12.84
bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86
SciCode_sub_accuracy: 9.38
qa_dingo_cn_score: 100
mmlu_accuracy: 76.23
mmlu-stem_accuracy: 78.08
mmlu-social-science_accuracy: 80.31
mmlu-humanities_accuracy: 71.38
mmlu-other_accuracy: 74.63
cmmlu_accuracy: 83.11
cmmlu-stem_accuracy: 79.42
cmmlu-social-science_accuracy: 83.34
cmmlu-humanities_accuracy: 83.95
cmmlu-other_accuracy: 86.22
cmmlu-china-specific_accuracy: 81.5
mmlu_pro_accuracy: 58.16
mmlu_pro_biology_accuracy: 74.62
mmlu_pro_business_accuracy: 65.02
mmlu_pro_chemistry_accuracy: 60.69
mmlu_pro_computer_science_accuracy: 61.46
mmlu_pro_economics_accuracy: 68.25
mmlu_pro_engineering_accuracy: 45.3
mmlu_pro_health_accuracy: 60.15
mmlu_pro_history_accuracy: 50.66
mmlu_pro_law_accuracy: 31.7
mmlu_pro_math_accuracy: 70.32
mmlu_pro_philosophy_accuracy: 47.7
mmlu_pro_physics_accuracy: 59.51
mmlu_pro_psychology_accuracy: 65.41
mmlu_pro_other_accuracy: 53.46
humanevalx-python_pass@1: 42.68
humanevalx-cpp_pass@1: 19.51
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 72.56
humanevalx-js_pass@1: 64.02
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 35
ds1000_Tensorflow_accuracy: 24.44
ds1000_Scipy_accuracy: 20.75
ds1000_Sklearn_accuracy: 21.74
ds1000_Pytorch_accuracy: 22.06
ds1000_Matplotlib_accuracy: 56.77
mmmlu_lite_accuracy: 42.6
openai_mmmlu_lite_AR-XY_accuracy: 32.84
openai_mmmlu_lite_BN-BD_accuracy: 10.46
openai_mmmlu_lite_DE-DE_accuracy: 24.56
openai_mmmlu_lite_ES-LA_accuracy: 50.95
openai_mmmlu_lite_FR-FR_accuracy: 61.05
openai_mmmlu_lite_HI-IN_accuracy: 30.6
openai_mmmlu_lite_ID-ID_accuracy: 45.89
openai_mmmlu_lite_IT-IT_accuracy: 51.79
openai_mmmlu_lite_JA-JP_accuracy: 51.65
openai_mmmlu_lite_KO-KR_accuracy: 48.77
openai_mmmlu_lite_PT-BR_accuracy: 52.7
openai_mmmlu_lite_SW-KE_accuracy: 32.91
openai_mmmlu_lite_YO-NG_accuracy: 32.84
openai_mmmlu_lite_ZH-CN_accuracy: 69.33
college_naive_average: 47
high_naive_average: 66.67
middle_naive_average: 81.67
primary_naive_average: 89.33
arithmetic_naive_average: 73.67
mathbench-a (average)_naive_average: 71.67
college_knowledge_naive_average: 82.91
high_knowledge_naive_average: 79.86
middle_knowledge_naive_average: 88.92
primary_knowledge_naive_average: 92.96
mathbench-t (average)_naive_average: 86.16
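
The fullbench tests earlier in this commit parametrize over `dataset_list(model, 'objective')`, which presumably expands blocks like the ones above into (model, metric) pairs. A hedged sketch of such a helper is given below; the baseline file name is assumed rather than taken from the diff.

```python
# Hypothetical dataset_list-style helper: list the metric keys recorded under a
# model's 'objective' or 'subjective' block in the fullbench baseline YAML.
import yaml


def dataset_list(model, category,
                 path='.github/scripts/oc_score_baseline_fullbench.yaml'):
    # 'path' is an assumed location for the baseline file shown above.
    with open(path, encoding='utf-8') as f:
        baseline = yaml.safe_load(f)
    return list(baseline.get(model, {}).get(category, {}).keys())
```

For `internlm3-8b-instruct-turbomind` this would yield keys such as `race-high_accuracy` and `gsm8k_accuracy`, matching the parametrized test names.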

View File

@ -1,27 +1,30 @@
chat:
glm-4-9b-chat-hf:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
glm-4-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 90.62
glm-4-9b-chat-vllm:
gsm8k_accuracy: 65.62
gsm8k_accuracy: 71.88
race-high_accuracy: 90.62
deepseek-7b-chat-hf:
gsm8k_accuracy: 46.88
race-high_accuracy: 81.25
deepseek-moe-16b-chat-hf:
gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-r1-distill-llama-8b-turbomind:
gsm8k_accuracy: 31.25
race-high_accuracy: 81.25
deepseek-r1-distill-qwen-1_5b-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 53.12
deepseek-7b-chat-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 75
race-high_accuracy: 78.12
gemma2-2b-it-hf:
gsm8k_accuracy: 50
race-high_accuracy: 71.88
race-high_accuracy: 75
gemma2-9b-it-hf:
gsm8k_accuracy: 71.88
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
gemma-2b-it-hf:
gsm8k_accuracy: 3.12
@ -36,34 +39,40 @@ chat:
gsm8k_accuracy: 78.12
race-high_accuracy: 93.75
gemma-7b-it-vllm:
gsm8k_accuracy: 34.38
gsm8k_accuracy: 31.25
race-high_accuracy: 68.75
internlm2_5-7b-chat-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm3-8b-instruct-hf:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.5
internlm2_5-7b-chat-turbomind:
gsm8k_accuracy: 87.50
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 31.25
race-high_accuracy: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 59.38
race-high_accuracy: 84.38
internlm2-chat-7b-sft-turbomind:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
internlm2-chat-7b-vllm:
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
race-high_accuracy: 90.62
internlm3-8b-instruct-turbomind:
gsm8k_accuracy: 68.75
race-high_accuracy: 87.5
internlm2-chat-7b-vllm:
gsm8k_accuracy: 59.38
race-high_accuracy: 87.50
llama-3_1-8b-instruct-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
llama-3_2-3b-instruct-hf:
gsm8k_accuracy: 65.62
gsm8k_accuracy: 71.88
race-high_accuracy: 81.25
llama-3-8b-instruct-hf:
gsm8k_accuracy: 68.75
@ -72,14 +81,14 @@ chat:
gsm8k_accuracy: 18.75
race-high_accuracy: 46.88
llama-3_1-8b-instruct-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k_accuracy: 62.50
gsm8k_accuracy: 68.75
race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 87.5
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
mistral-7b-instruct-v0.2-hf:
gsm8k_accuracy: 40.62
race-high_accuracy: 75
@ -90,17 +99,14 @@ chat:
gsm8k_accuracy: 75
race-high_accuracy: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.50
gsm8k_accuracy: 71.88
race-high_accuracy: 78.12
mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
race-high_accuracy: 65.62
mistral-7b-instruct-v0.2-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 75
phi-3-mini-4k-instruct-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
gsm8k_accuracy: 21.88
race-high_accuracy: 78.12
qwen2.5-0.5b-instruct-hf:
gsm8k_accuracy: 34.38
race-high_accuracy: 46.88
@ -108,10 +114,10 @@ chat:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
qwen2.5-0.5b-instruct-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 50
gsm8k_accuracy: 31.25
race-high_accuracy: 43.75
qwen2.5-3b-instruct-turbomind:
gsm8k_accuracy: 59.38
gsm8k_accuracy: 56.25
race-high_accuracy: 90.62
qwen1.5-0.5b-chat-hf:
gsm8k_accuracy: 0
@ -123,11 +129,11 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
qwen2-1.5b-instruct-turbomind:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
qwen2-7b-instruct-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
race-high_accuracy: 87.50
qwen1.5-0.5b-chat-vllm:
gsm8k_accuracy: 3.12
race-high_accuracy: 53.12
@ -143,11 +149,11 @@ chat:
yi-1.5-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
deepseek-v2-lite-chat-hf:
gsm8k_accuracy: 46.88
deepseek-v2_lite-chat-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 71.88
gemma2-27b-it-hf:
gsm8k_accuracy: 75
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
internlm2_5-20b-chat-hf:
gsm8k_accuracy: 84.38
@ -161,6 +167,9 @@ chat:
mistral-small-instruct-2409-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
phi-4:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
qwen2.5-14b-instruct-hf:
gsm8k_accuracy: 71.88
race-high_accuracy: 96.88
@ -168,40 +177,41 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 93.75
yi-1.5-34b-chat-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 75.00
race-high_accuracy: 93.75
deepseek-67b-chat-hf:
gsm8k_accuracy: 71.88
deepseek-67b-chat-turbomind:
gsm8k_accuracy: 75.00
race-high_accuracy: 78.12
deepseek-r1-distill-qwen-32b-turbomind:
gsm8k_accuracy: 25
race-high_accuracy: 90.62
llama-3_3-70b-instruct-turbomind:
gsm8k_accuracy: 93.75
race-high_accuracy: 87.5
mixtral-8x7b-instruct-v0.1-hf:
gsm8k_accuracy: 56.25
race-high_accuracy: 81.25
mixtral-large-instruct-2411-turbomind:
gsm8k_accuracy: 90.62
gsm8k_accuracy: 87.50
race-high_accuracy: 93.75
nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
gsm8k_accuracy: 87.5
race-high_accuracy: 46.88
gsm8k_accuracy: 93.75
race-high_accuracy: 50.00
qwen2.5-72b-instruct-turbomind:
gsm8k_accuracy: 75
race-high_accuracy: 93.75
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
deepseek-r1-distill-llama-70b-turbomind:
gsm8k_accuracy: 40.62
race-high_accuracy: 90.62
deepseek-v2_5-1210-turbomind:
gsm8k_accuracy: 90.62
race-high_accuracy: 84.38
mixtral-8x22b-instruct-v0.1-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 81.25
mixtral-8x22b-instruct-v0.1-turbomind:
gsm8k_accuracy: 78.12
race-high_accuracy: 78.12
mixtral-8x22b-instruct-v0.1-vllm:
gsm8k_accuracy: 78.12
race-high_accuracy: 78.12
base:
glm-4-9b-hf:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
glm-4-9b-turbomind:
gsm8k_accuracy: 62.5
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
@ -210,15 +220,10 @@ base:
GPQA_diamond_accuracy: 0
race-high_accuracy: 46.88
winogrande_accuracy: 71.88
deepseek-moe-16b-base-hf:
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 0
race-high_accuracy: 21.88
winogrande_accuracy: 65.62
deepseek-7b-base-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 18.75
GPQA_diamond_accuracy: 0
race-high_accuracy: 46.88
race-high_accuracy: 43.75
winogrande_accuracy: 84.38
deepseek-moe-16b-base-vllm:
gsm8k_accuracy: 21.88
@ -226,35 +231,40 @@ base:
race-high_accuracy: 25
winogrande_accuracy: 68.75
gemma2-2b-hf:
gsm8k_accuracy: 28.12
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 56.25
winogrande_accuracy: 71.88
winogrande_accuracy: 75.00
gemma2-9b-hf:
gsm8k_accuracy: 68.75
gsm8k_accuracy: 75.00
GPQA_diamond_accuracy: 0
race-high_accuracy: 81.25
winogrande_accuracy: 84.38
race-high_accuracy: 84.38
winogrande_accuracy: 81.25
gemma-2b-hf:
gsm8k_accuracy: 18.75
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 25
race-high_accuracy: 21.88
winogrande_accuracy: 53.12
gemma-7b-hf:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 6.25
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 65.62
winogrande_accuracy: 78.12
winogrande_accuracy: 71.88
gemma-2-9b-turbomind:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 0
race-high_accuracy: 18.75
winogrande_accuracy: 46.88
gemma-2b-vllm:
gsm8k_accuracy: 15.62
GPQA_diamond_accuracy: 3.12
race-high_accuracy:
winogrande_accuracy:
race-high_accuracy: 28.12
winogrande_accuracy: 68.75
gemma-7b-vllm:
gsm8k_accuracy: 53.12
GPQA_diamond_accuracy: 9.38
race-high_accuracy:
winogrande_accuracy:
gsm8k_accuracy: 43.75
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 81.25
winogrande_accuracy: 81.25
internlm2_5-7b-hf:
gsm8k_accuracy: 37.5
GPQA_diamond_accuracy: 25
@ -265,31 +275,26 @@ base:
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 62.5
winogrande_accuracy: 78.12
internlm2-base-7b-hf:
gsm8k_accuracy: 3.12
GPQA_diamond_accuracy: 21.88
race-high_accuracy: 75
winogrande_accuracy: 65.62
internlm2-1.8b-turbomind:
gsm8k_accuracy: 12.5
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 6.25
GPQA_diamond_accuracy: 12.5
race-high_accuracy: 71.88
winogrande_accuracy: 78.12
internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 93.75
winogrande_accuracy: 87.50
internlm2-7b-turbomind:
gsm8k_accuracy: 53.12
GPQA_diamond_accuracy: 21.88
race-high_accuracy: 71.88
winogrande_accuracy: 84.38
internlm2-base-7b-turbomind:
gsm8k_accuracy: 37.50
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 81.25
winogrande_accuracy: 75
internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
internlm2-7b-turbomind:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 78.12
winogrande_accuracy: 71.88
internlm2-base-7b-turbomind:
gsm8k_accuracy: 28.12
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 71.88
winogrande_accuracy: 62.50
llama-2-7b-hf:
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 21.88
@ -306,15 +311,15 @@ base:
race-high_accuracy: 65.62
winogrande_accuracy: 65.62
llama-3.1-8b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 78.12
winogrande_accuracy: 78.12
llama-3-8b-turbomind:
gsm8k_accuracy: 50
gsm8k_accuracy: 46.88
GPQA_diamond_accuracy: 12.50
race-high_accuracy: 65.62
winogrande_accuracy: 78.12
winogrande_accuracy: 81.25
mistral-7b-v0.3-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 6.25
@ -326,15 +331,15 @@ base:
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
qwen2.5-1.5b-turbomind:
gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 12.50
race-high_accuracy: 78.12
winogrande_accuracy: 68.75
qwen2.5-7b-turbomind:
gsm8k_accuracy: 75.00
GPQA_diamond_accuracy: 25
race-high_accuracy: 87.5
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 75
winogrande_accuracy: 71.88
qwen2.5-7b-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 87.5
winogrande_accuracy: 75.00
qwen1.5-moe-a2.7b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 18.75
@ -356,20 +361,20 @@ base:
race-high_accuracy: 87.5
winogrande_accuracy: 68.75
qwen2-1.5b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 81.25
winogrande_accuracy: 75
qwen2-7b-turbomind:
gsm8k_accuracy: 75.00
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 12.5
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
winogrande_accuracy: 75
qwen1.5-0.5b-vllm:
gsm8k_accuracy: 9.38
GPQA_diamond_accuracy: 0
race-high_accuracy: 56.25
winogrande_accuracy: 62.5
winogrande_accuracy: 59.38
yi-1.5-6b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
@ -384,25 +389,10 @@ base:
gsm8k_accuracy: 78.12
GPQA_diamond_accuracy: 40.62
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
deepseek-v2-lite-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 59.38
winogrande_accuracy: 71.88
internlm2-20b-hf:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 68.75
winogrande_accuracy: 75
internlm2-base-20b-hf:
gsm8k_accuracy: 12.5
GPQA_diamond_accuracy: 9.38
race-high_accuracy: 84.38
winogrande_accuracy: 65.62
internlm2-20b-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 15.62
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 68.75
winogrande_accuracy: 81.25
qwen2.5-14b-hf:
@ -420,33 +410,23 @@ base:
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75
winogrande_accuracy: 81.25
deepseek-67b-base-hf:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 81.25
winogrande_accuracy: 90.62
deepseek-67b-base-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 81.25
winogrande_accuracy: 84.38
llama-3-70b-turbomind:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 9.38
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 78.12
winogrande_accuracy: 81.25
llama-3-70b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
qwen2.5-72b-turbomind:
gsm8k_accuracy: 84.38
GPQA_diamond_accuracy: 34.38
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
deepseek-v2-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 81.25
winogrande_accuracy: 75
llama-3-70b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
gsm8k_accuracy: 65.62
GPQA_diamond_accuracy: 9.38
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
winogrande_accuracy: 81.25

View File

@ -17,7 +17,7 @@ on:
required: false
description: 'whether to build lmdeploy'
type: boolean
default: false
default: true
repo_org_lmdeploy:
required: false
description: 'Tested repository organization name. Default is internlm/lmdeploy'
@ -44,7 +44,7 @@ on:
type: string
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
schedule:
- cron: '15 14 * * *'
- cron: '15 14 * * 0,3'
env:
HF_DATASETS_OFFLINE: 1
@ -61,6 +61,7 @@ env:
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
CONDA_ENV: regression_test
VLLM_WORKER_MULTIPROC_METHOD: spawn
jobs:
build-pypi:
@ -87,12 +88,11 @@ jobs:
name: my-artifact-${{ github.run_id }}
build-pypi-lmdeploy:
if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}}
strategy:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
environment: 'prod'
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
@ -126,8 +126,7 @@ jobs:
if: ${{!cancelled()}}
needs: ['build-pypi', 'build-pypi-lmdeploy']
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 240 #4hours
timeout-minutes: 120 #2hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -148,7 +147,7 @@ jobs:
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 240
timeout_minutes: 120
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda create -y --name ${{env.CONDA_ENV}} python=3.10
@ -157,20 +156,23 @@ jobs:
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
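
Note that the prebuilt flash-attn wheel above is named for torch 2.5/cu12 while torch 2.6.0 is pinned a few lines earlier, so a quick post-install probe can surface an ABI mismatch before the long evaluation jobs start. A sketch follows; it is illustrative only and not part of the workflow.

```python
# Illustrative post-install probe, not part of the workflow: import both
# packages and report versions so a torch/flash-attn ABI mismatch fails fast.
import flash_attn
import torch

print('torch', torch.__version__, 'cuda', torch.version.cuda)
print('flash-attn', flash_attn.__version__)
assert torch.cuda.is_available(), 'CUDA runtime not visible to torch'
```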
- name: Prepare - reinstall lmdeploy - cu12
if: ${{inputs.build_lmdeploy}}
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12
if: ${{inputs.build_lmdeploy}}
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip uninstall -y lmdeploy
pip install lmdeploy-*.whl --no-deps
- name: conda env
run: |
@ -187,8 +189,7 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
runs-on: volc_cu12_daily
environment: 'prod'
timeout-minutes: 120 #2hours
timeout-minutes: 180 #3hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -210,7 +211,7 @@ jobs:
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 120
timeout_minutes: 180
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
@ -228,8 +229,7 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
runs-on: volc_cu12_local
environment: 'prod'
timeout-minutes: 240 #4hours
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -255,27 +255,33 @@ jobs:
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api
if: matrix.regression_func == 'api'
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s
sleep 180s
env | grep PROXY
env | grep proxy
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@ -304,8 +310,7 @@ jobs:
matrix:
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 360 #6hours
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -322,7 +327,7 @@ jobs:
uses: nick-fields/retry@v3
with:
max_attempts: 1
timeout_minutes: 360
timeout_minutes: 480
command: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
@ -334,11 +339,10 @@ jobs:
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
timeout-minutes: 5
runs-on: self-hosted
environment: 'prod'
steps:
- name: notify
run: |

View File

@ -45,7 +45,7 @@ jobs:
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
python3 -m pip uninstall opencompass -y
python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
- name: conda env
run: |

View File

@ -20,7 +20,7 @@ jobs:
matrix:
python-version: ['3.10']
include:
- torch: 2.0.0
- torch: 2.5.1
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
@ -30,7 +30,7 @@ jobs:
- name: Upgrade pip
run: python -m pip install --upgrade pip
- name: Install PyTorch
run: pip install torch==${{matrix.torch}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
run: pip install torch==${{matrix.torch}} -f https://download.pytorch.org/whl/cpu/torch_stable.html
- name: Install system dependencies
run: |
sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
@ -106,7 +106,7 @@ jobs:
- name: Upgrade pip
run: python -m pip install pip --upgrade
- name: Install PyTorch
run: pip install torch==2.0.0+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
run: pip install torch==2.5.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html
- name: Install opencompass dependencies
run: |
pip install -r requirements.txt

View File

@ -120,4 +120,4 @@ repos:
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
# args: ["mmocr", "tests", "tools"] # these directories will be checked

View File

@ -120,4 +120,4 @@ repos:
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
# args: ["mmocr", "tests", "tools"] # these directories will be checked

README.md
View File

@ -57,6 +57,10 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
@ -173,69 +177,83 @@ Some third-party features, like Humaneval and Llama, may require additional step
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can start your first evaluation using OpenCompass!
- Your first evaluation with OpenCompass!
### Your first evaluation with OpenCompass!
OpenCompass supports setting your configurations via the CLI or a Python script. For simple evaluation settings we recommend using the CLI; for more complex evaluations, using a script is suggested. You can find more example scripts under the configs folder.
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen

# Python scripts
opencompass examples/eval_chat_demo.py
```
You can find more script examples under the [examples](./examples) folder.
- API evaluation
### API evaluation
OpenCompass, by design, does not discriminate between open-source models and API models. You can evaluate both model types in the same way, or even in one setting.
```bash
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
# CLI
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen

# Python scripts
opencompass examples/eval_api_demo.py

# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
```
- Accelerated Evaluation
### Accelerated Evaluation
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
```bash
# CLI
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy

# Python scripts
opencompass examples/eval_lmdeploy_demo.py
```
- Supported Models
### Supported Models and Datasets
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```
If the model is not on the list but supported by Huggingface AutoModel class, you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
#### Supported Models
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
If the model is not on the list but is supported by the Huggingface AutoModel class, or is deployed behind an inference engine that exposes an OpenAI-compatible interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
```bash
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
```
#### Supported Datasets
Currently, OpenCompass provides standard recommended configurations for its datasets. In general, config files ending with `_gen.py` or `_llm_judge_gen.py` point to the recommended config we provide for that dataset. You can refer to the [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
```bash
# Recommended Evaluation Config based on Rules
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
# Recommended Evaluation Config based on LLM Judge
opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
```
If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
```bash
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
```
> \[!TIP\]
>
@ -279,263 +297,15 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
## 📖 Dataset Support
<table align="center">
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Language</b>
</td>
<td>
<b>Knowledge</b>
</td>
<td>
<b>Reasoning</b>
</td>
<td>
<b>Examination</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Word Definition</b></summary>
A statistical list of all datasets that can be used on this platform is available in the documentation on the OpenCompass website.
- WiC
- SummEdits
You can quickly find the dataset you need from the list using the sorting, filtering, and search functions.
</details>
In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.
<details open>
<summary><b>Idiom Learning</b></summary>
Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.
- CHID
</details>
<details open>
<summary><b>Semantic Similarity</b></summary>
- AFQMC
- BUSTM
</details>
<details open>
<summary><b>Coreference Resolution</b></summary>
- CLUEWSC
- WSC
- WinoGrande
</details>
<details open>
<summary><b>Translation</b></summary>
- Flores
- IWSLT2017
</details>
<details open>
<summary><b>Multi-language Question Answering</b></summary>
- TyDi-QA
- XCOPA
</details>
<details open>
<summary><b>Multi-language Summary</b></summary>
- XLSum
</details>
</td>
<td>
<details open>
<summary><b>Knowledge Question Answering</b></summary>
- BoolQ
- CommonSenseQA
- NaturalQuestions
- TriviaQA
</details>
</td>
<td>
<details open>
<summary><b>Textual Entailment</b></summary>
- CMNLI
- OCNLI
- OCNLI_FC
- AX-b
- AX-g
- CB
- RTE
- ANLI
</details>
<details open>
<summary><b>Commonsense Reasoning</b></summary>
- StoryCloze
- COPA
- ReCoRD
- HellaSwag
- PIQA
- SIQA
</details>
<details open>
<summary><b>Mathematical Reasoning</b></summary>
- MATH
- GSM8K
</details>
<details open>
<summary><b>Theorem Application</b></summary>
- TheoremQA
- StrategyQA
- SciBench
</details>
<details open>
<summary><b>Comprehensive Reasoning</b></summary>
- BBH
</details>
</td>
<td>
<details open>
<summary><b>Junior High, High School, University, Professional Examinations</b></summary>
- C-Eval
- AGIEval
- MMLU
- GAOKAO-Bench
- CMMLU
- ARC
- Xiezhi
</details>
<details open>
<summary><b>Medical Examinations</b></summary>
- CMB
</details>
</td>
</tr>
</td>
</tr>
</tbody>
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Understanding</b>
</td>
<td>
<b>Long Context</b>
</td>
<td>
<b>Safety</b>
</td>
<td>
<b>Code</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Reading Comprehension</b></summary>
- C3
- CMRC
- DRCD
- MultiRC
- RACE
- DROP
- OpenBookQA
- SQuAD2.0
</details>
<details open>
<summary><b>Content Summary</b></summary>
- CSL
- LCSTS
- XSum
- SummScreen
</details>
<details open>
<summary><b>Content Analysis</b></summary>
- EPRSTMT
- LAMBADA
- TNEWS
</details>
</td>
<td>
<details open>
<summary><b>Long Context Understanding</b></summary>
- LEval
- LongBench
- GovReports
- NarrativeQA
- Qasper
</details>
</td>
<td>
<details open>
<summary><b>Safety</b></summary>
- CivilComments
- CrowsPairs
- CValues
- JigsawMultilingual
- TruthfulQA
</details>
<details open>
<summary><b>Robustness</b></summary>
- AdvGLUE
</details>
</td>
<td>
<details open>
<summary><b>Code</b></summary>
- HumanEval
- HumanEvalX
- MBPP
- APPs
- DS1000
</details>
</td>
</tr>
</td>
</tr>
</tbody>
</table>
<p align="right"><a href="#top">🔝Back to top</a></p>
## 📖 Model Support

View File

@ -57,6 +57,10 @@
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, which lets multiple evaluators work in sequence so that custom evaluation pipelines can be built for more complex scenarios. See the [documentation](docs/zh_cn/advanced_guides/llm_judge.md) for usage details! 🔥🔥🔥
- **\[2025.03.11\]** `SuperGPQA`, a knowledge benchmark covering 285 graduate-level disciplines, is now supported. Give it a try! 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/zh_cn/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) documentation for more details! 🔥🔥🔥
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model, which achieves top performance among models of its size on reasoning and knowledge-intensive tasks. Give it a try.
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py) leaderboard, which allows you to reproduce the official results with a simple configuration.
- **\[2024.10.14\]** The OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU) is now supported. Give it a try! 🔥🔥🔥
@ -205,9 +209,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
```
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations with the [tools](./docs/zh_cn/tools.md#ListConfigs).
- ### Supported Models and Datasets
- ### Supported Models
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations with the [tools](./docs/zh_cn/tools.md#ListConfigs).
```bash
# List all configurations
@ -216,13 +220,27 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
python tools/list_configs.py llama mmlu
```
If the model is not on the list but is supported by the Huggingface AutoModel class, you can still evaluate it with OpenCompass. You are welcome to help maintain the lists of models and datasets supported by OpenCompass.
#### Supported Models
If the model is not on the list but is supported by the Huggingface AutoModel class, or is deployed behind an inference engine that exposes an OpenAI-compatible interface (see the [official documentation](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html) for details), you can still evaluate it with OpenCompass. You are welcome to help maintain the lists of models and datasets supported by OpenCompass.
```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```
If you want to run model inference on multiple GPUs in data parallel, you can use the `--max-num-worker` argument.
#### Supported Datasets
Currently, OpenCompass provides standard recommended configurations for its datasets. In general, config files ending with `_gen.py` or `_llm_judge_gen.py` point to the recommended config we provide for that dataset. See the dataset statistics chapter of the [official documentation](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) for details.
```bash
# Recommended rule-based evaluation config
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
# Recommended LLM-judge-based evaluation config
opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
```
In addition, if you want to run model inference on multiple GPUs in data parallel, you can use the `--max-num-worker` argument.
```bash
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
@ -274,263 +292,11 @@ OpenCompass is a one-stop platform for large model evaluation. Its main features are as follows
## 📖 Dataset Support
<table align="center">
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Language</b>
</td>
<td>
<b>Knowledge</b>
</td>
<td>
<b>Reasoning</b>
</td>
<td>
<b>Examination</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Word Definition</b></summary>
A statistical list of all datasets that can be used on this platform is provided in the documentation on the OpenCompass website.
- WiC
- SummEdits
You can quickly find the dataset you need from the list using the sorting, filtering, and search functions.
</details>
<details open>
<summary><b>Idiom Learning</b></summary>
- CHID
</details>
<details open>
<summary><b>Semantic Similarity</b></summary>
- AFQMC
- BUSTM
</details>
<details open>
<summary><b>Coreference Resolution</b></summary>
- CLUEWSC
- WSC
- WinoGrande
</details>
<details open>
<summary><b>Translation</b></summary>
- Flores
- IWSLT2017
</details>
<details open>
<summary><b>Multi-language Question Answering</b></summary>
- TyDi-QA
- XCOPA
</details>
<details open>
<summary><b>Multi-language Summary</b></summary>
- XLSum
</details>
</td>
<td>
<details open>
<summary><b>Knowledge Question Answering</b></summary>
- BoolQ
- CommonSenseQA
- NaturalQuestions
- TriviaQA
</details>
</td>
<td>
<details open>
<summary><b>Textual Entailment</b></summary>
- CMNLI
- OCNLI
- OCNLI_FC
- AX-b
- AX-g
- CB
- RTE
- ANLI
</details>
<details open>
<summary><b>Commonsense Reasoning</b></summary>
- StoryCloze
- COPA
- ReCoRD
- HellaSwag
- PIQA
- SIQA
</details>
<details open>
<summary><b>Mathematical Reasoning</b></summary>
- MATH
- GSM8K
</details>
<details open>
<summary><b>Theorem Application</b></summary>
- TheoremQA
- StrategyQA
- SciBench
</details>
<details open>
<summary><b>Comprehensive Reasoning</b></summary>
- BBH
</details>
</td>
<td>
<details open>
<summary><b>Junior High, High School, University, Professional Examinations</b></summary>
- C-Eval
- AGIEval
- MMLU
- GAOKAO-Bench
- CMMLU
- ARC
- Xiezhi
</details>
<details open>
<summary><b>Medical Examinations</b></summary>
- CMB
</details>
</td>
</tr>
</td>
</tr>
</tbody>
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Understanding</b>
</td>
<td>
<b>Long Context</b>
</td>
<td>
<b>Safety</b>
</td>
<td>
<b>Code</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Reading Comprehension</b></summary>
- C3
- CMRC
- DRCD
- MultiRC
- RACE
- DROP
- OpenBookQA
- SQuAD2.0
</details>
<details open>
<summary><b>Content Summary</b></summary>
- CSL
- LCSTS
- XSum
- SummScreen
</details>
<details open>
<summary><b>Content Analysis</b></summary>
- EPRSTMT
- LAMBADA
- TNEWS
</details>
</td>
<td>
<details open>
<summary><b>Long Context Understanding</b></summary>
- LEval
- LongBench
- GovReports
- NarrativeQA
- Qasper
</details>
</td>
<td>
<details open>
<summary><b>Safety</b></summary>
- CivilComments
- CrowsPairs
- CValues
- JigsawMultilingual
- TruthfulQA
</details>
<details open>
<summary><b>Robustness</b></summary>
- AdvGLUE
</details>
</td>
<td>
<details open>
<summary><b>Code</b></summary>
- HumanEval
- HumanEvalX
- MBPP
- APPs
- DS1000
</details>
</td>
</tr>
</td>
</tr>
</tbody>
</table>
For details, please refer to the dataset statistics chapter of the [official documentation](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html).
<p align="right"><a href="#top">🔝Back to top</a></p>

dataset-index.yml Normal file
View File

@ -0,0 +1,999 @@
- ifeval:
name: IFEval
category: Instruction Following
paper: https://arxiv.org/pdf/2311.07911
configpath: opencompass/configs/datasets/IFEval/IFEval_gen.py
configpath_llmjudge: ''
- nphard:
name: NPHardEval
category: Reasoning
paper: https://arxiv.org/pdf/2312.14890v2
configpath: opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py
configpath_llmjudge: ''
- pmmeval:
name: PMMEval
category: Language
paper: https://arxiv.org/pdf/2411.09116v1
configpath: opencompass/configs/datasets/PMMEval/pmmeval_gen.py
configpath_llmjudge: ''
- theoremqa:
name: TheoremQA
category: Reasoning
paper: https://arxiv.org/pdf/2305.12524
configpath: opencompass/configs/datasets/TheroremQA/TheoremQA_gen.py
configpath_llmjudge: ''
- agieval:
name: AGIEval
category: Examination
paper: https://arxiv.org/pdf/2304.06364
configpath: opencompass/configs/datasets/agieval/agieval_gen.py
configpath_llmjudge: ''
- babilong:
name: BABILong
category: Long Context
paper: https://arxiv.org/pdf/2406.10149
configpath: opencompass/configs/datasets/babilong
configpath_llmjudge: ''
- bigcodebench:
name: BigCodeBench
category: Code
paper: https://arxiv.org/pdf/2406.15877
configpath: opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
configpath_llmjudge: ''
- calm:
name: CaLM
category: Reasoning
paper: https://arxiv.org/pdf/2405.00622
configpath: opencompass/configs/datasets/calm/calm.py
configpath_llmjudge: ''
- infinitebench:
name: InfiniteBench (∞Bench)
category: Long Context
paper: https://aclanthology.org/2024.acl-long.814.pdf
configpath: opencompass/configs/datasets/infinitebench/infinitebench.py
configpath_llmjudge: ''
- korbench:
name: KOR-Bench
category: Reasoning
paper: https://arxiv.org/pdf/2410.06526v1
configpath: opencompass/configs/datasets/korbench/korbench_gen.py
configpath_llmjudge: opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
- lawbench:
name: LawBench
category: Knowledge / Law
paper: https://arxiv.org/pdf/2309.16289
configpath:
- opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py
- opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py
configpath_llmjudge: ''
- leval:
name: L-Eval
category: Long Context
paper: https://arxiv.org/pdf/2307.11088v1
configpath: opencompass/configs/datasets/leval/leval.py
configpath_llmjudge: ''
- livecodebench:
name: LiveCodeBench
category: Code
paper: https://arxiv.org/pdf/2403.07974
configpath: opencompass/configs/datasets/livecodebench/livecodebench_gen.py
configpath_llmjudge: ''
- livemathbench:
name: LiveMathBench
category: Math
paper: https://arxiv.org/pdf/2412.13147
configpath: opencompass/configs/datasets/livemathbench/livemathbench_gen.py
configpath_llmjudge: ''
- livereasonbench:
name: LiveReasonBench
category: Reasoning
paper: ''
configpath: opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py
configpath_llmjudge: ''
- longbench:
name: LongBench
category: Long Context
paper: https://github.com/THUDM/LongBench
configpath:
- opencompass/configs/datasets/longbench/longbench.py
- opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py
configpath_llmjudge: ''
- lveval:
name: LV-Eval
category: Long Context
paper: https://arxiv.org/pdf/2402.05136
configpath: opencompass/configs/datasets/lveval/lveval.py
configpath_llmjudge: ''
- mastermath2024v1:
name: Mastermath2024v1
category: Math
paper: ''
configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
configpath_llmjudge: ''
- medbench:
name: MedBench
category: Knowledge / Medicine
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
configpath_llmjudge: ''
- MedXpertQA:
name: MedXpertQA
category: Knowledge / Medicine
paper: https://arxiv.org/abs/2501.18362
configpath: opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py
configpath_llmjudge: opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py
- musr:
name: MuSR
category: Reasoning
paper: https://arxiv.org/pdf/2310.16049
configpath: opencompass/configs/datasets/musr/musr_gen.py
configpath_llmjudge: opencompass/configs/datasets/musr/musr_llm_judge_gen.py
- needlebench:
name: NeedleBench
category: Long Context
paper: https://arxiv.org/pdf/2407.11963
configpath: opencompass/configs/datasets/needlebench
configpath_llmjudge: ''
- ruler:
name: RULER
category: Long Context
paper: https://arxiv.org/pdf/2404.06654
configpath: opencompass/configs/datasets/ruler
configpath_llmjudge: ''
- alignment:
name: AlignBench
category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.18743
configpath: opencompass/configs/datasets/subjective/alignbench
configpath_llmjudge: ''
- alpaca:
name: AlpacaEval
category: Subjective / Instruction Following
paper: https://github.com/tatsu-lab/alpaca_eval
configpath: opencompass/configs/datasets/subjective/aplaca_eval
configpath_llmjudge: ''
- arenahard:
name: Arena-Hard
category: Subjective / Chatbot
paper: https://lmsys.org/blog/2024-04-19-arena-hard/
configpath: opencompass/configs/datasets/subjective/arena_hard
configpath_llmjudge: ''
- flames:
name: FLAMES
category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.06899
configpath: opencompass/configs/datasets/subjective/flames/flames_gen.py
configpath_llmjudge: ''
- fofo:
name: FOFO
category: Subjective / Format Following
paper: https://arxiv.org/pdf/2402.18667
configpath: opencompass/configs/datasets/subjective/fofo
configpath_llmjudge: ''
- followbench:
name: FollowBench
category: Subjective / Instruction Following
paper: https://arxiv.org/pdf/2310.20410
configpath: opencompass/configs/datasets/subjective/followbench
configpath_llmjudge: ''
- hellobench:
name: HelloBench
category: Subjective / Long Context
paper: https://arxiv.org/pdf/2409.16191
configpath: opencompass/configs/datasets/subjective/hellobench
configpath_llmjudge: ''
- judgerbench:
name: JudgerBench
category: Subjective / Long Context
paper: https://arxiv.org/pdf/2410.16256
configpath: opencompass/configs/datasets/subjective/judgerbench
configpath_llmjudge: ''
- multiround:
name: MT-Bench-101
category: Subjective / Multi-Round
paper: https://arxiv.org/pdf/2402.14762
configpath: opencompass/configs/datasets/subjective/multiround
configpath_llmjudge: ''
- wildbench:
name: WildBench
category: Subjective / Real Task
paper: https://arxiv.org/pdf/2406.04770
configpath: opencompass/configs/datasets/subjective/wildbench
configpath_llmjudge: ''
- teval:
name: T-Eval
category: Tool Utilization
paper: https://arxiv.org/pdf/2312.14033
configpath:
- opencompass/configs/datasets/teval/teval_en_gen.py
- opencompass/configs/datasets/teval/teval_zh_gen.py
configpath_llmjudge: ''
- finalceiq:
name: FinanceIQ
category: Knowledge / Finance
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
configpath: opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py
configpath_llmjudge: ''
- gaokaobench:
name: GAOKAOBench
category: Examination
paper: https://arxiv.org/pdf/2305.12474
configpath: opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py
configpath_llmjudge: ''
- lcbench:
name: LCBench
category: Code
paper: https://github.com/open-compass/CodeBench/
configpath: opencompass/configs/datasets/LCBench/lcbench_gen.py
configpath_llmjudge: ''
- MMLUArabic:
name: ArabicMMLU
category: Language
paper: https://arxiv.org/pdf/2402.12840
configpath: opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py
configpath_llmjudge: ''
- OpenFinData:
name: OpenFinData
category: Knowledge / Finance
paper: https://github.com/open-compass/OpenFinData
configpath: opencompass/configs/datasets/OpenFinData/OpenFinData_gen.py
configpath_llmjudge: ''
- QuALITY:
name: QuALITY
category: Long Context
paper: https://arxiv.org/pdf/2112.08608
configpath: opencompass/configs/datasets/QuALITY/QuALITY_gen.py
configpath_llmjudge: ''
- advglue:
name: Adversarial GLUE
category: Safety
paper: https://openreview.net/pdf?id=GF9cSKI3A_q
configpath:
- opencompass/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_qnli/adv_glue_qnli_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_qqp/adv_glue_qqp_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_rte/adv_glue_rte_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_sst2/adv_glue_sst2_gen.py
configpath_llmjudge: ''
- afqmcd:
name: CLUE / AFQMC
category: Language
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py
configpath_llmjudge: ''
- aime2024:
name: AIME2024
category: Examination
paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py
configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
- anli:
name: Adversarial NLI
category: Reasoning
paper: https://arxiv.org/pdf/1910.14599v2
configpath: opencompass/configs/datasets/anli/anli_gen.py
configpath_llmjudge: ''
- anthropics_evals:
name: Anthropics Evals
category: Safety
paper: https://arxiv.org/pdf/2212.09251
configpath:
- opencompass/configs/datasets/anthropics_evals/airisk_gen.py
- opencompass/configs/datasets/anthropics_evals/persona_gen.py
- opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py
configpath_llmjudge: ''
- apps:
name: APPS
category: Code
paper: https://arxiv.org/pdf/2105.09938
configpath:
- opencompass/configs/datasets/apps/apps_gen.py
- opencompass/configs/datasets/apps/apps_mini_gen.py
configpath_llmjudge: ''
- arc:
name: ARC
category: Reasoning
paper: https://arxiv.org/pdf/1803.05457
configpath:
- opencompass/configs/datasets/ARC_c/ARC_c_gen.py
- opencompass/configs/datasets/ARC_e/ARC_e_gen.py
configpath_llmjudge: ''
- arc_prize_public_eval:
name: ARC Prize
category: ARC-AGI
paper: https://arcprize.org/guide#private
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py
configpath_llmjudge: ''
- ax:
name: SuperGLUE / AX
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath:
- opencompass/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen.py
- opencompass/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen.py
configpath_llmjudge: ''
- bbh:
name: BIG-Bench Hard
category: Reasoning
paper: https://arxiv.org/pdf/2210.09261
configpath: opencompass/configs/datasets/bbh/bbh_gen.py
configpath_llmjudge: opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
- bbeh:
name: BIG-Bench Extra Hard
category: Reasoning
paper: https://arxiv.org/abs/2502.19187
configpath: opencompass/configs/datasets/bbeh
configpath_llmjudge: ''
- BoolQ:
name: SuperGLUE / BoolQ
category: Knowledge
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
configpath_llmjudge: ''
- c3:
name: CLUE / C3 (C³)
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py
configpath_llmjudge: ''
- cb:
name: SuperGLUE / CB
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py
configpath_llmjudge: ''
- ceval:
name: C-EVAL
category: Examination
paper: https://arxiv.org/pdf/2305.08322v1
configpath: opencompass/configs/datasets/ceval/ceval_gen.py
configpath_llmjudge: ''
- charm:
name: CHARM
category: Reasoning
paper: https://arxiv.org/pdf/2403.14112
configpath: opencompass/configs/datasets/CHARM/charm_reason_gen.py
configpath_llmjudge: ''
- chembench:
name: ChemBench
category: Knowledge / Chemistry
paper: https://arxiv.org/pdf/2404.01475
configpath: opencompass/configs/datasets/ChemBench/ChemBench_gen.py
configpath_llmjudge: ''
- chid:
name: FewCLUE / CHID
category: Language
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py
configpath_llmjudge: ''
- chinese_simpleqa:
name: Chinese SimpleQA
category: Knowledge
paper: https://arxiv.org/pdf/2411.07140
configpath: opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py
configpath_llmjudge: ''
- cibench:
name: CIBench
category: Code
paper: https://www.arxiv.org/pdf/2407.10499
configpath:
- opencompass/configs/datasets/CIBench/CIBench_generation_gen_8ab0dc.py
- opencompass/configs/datasets/CIBench/CIBench_template_gen_e6b12a.py
- opencompass/configs/datasets/CIBench/CIBench_template_oracle_gen_fecda1.py
configpath_llmjudge: ''
- civilcomments:
name: CivilComments
category: Safety
paper: https://arxiv.org/pdf/1903.04561
configpath: opencompass/configs/datasets/civilcomments/civilcomments_clp.py
configpath_llmjudge: ''
- clozeTest_maxmin:
name: Cloze Test-max/min
category: Code
paper: https://arxiv.org/pdf/2102.04664
configpath: opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py
configpath_llmjudge: ''
- cluewsc:
name: FewCLUE / CLUEWSC
category: Language / WSC
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py
configpath_llmjudge: ''
- cmb:
name: CMB
category: Knowledge / Medicine
paper: https://arxiv.org/pdf/2308.08833
configpath: opencompass/configs/datasets/cmb/cmb_gen.py
configpath_llmjudge: ''
- cmmlu:
name: CMMLU
category: Understanding
paper: https://arxiv.org/pdf/2306.09212
configpath: opencompass/configs/datasets/cmmlu/cmmlu_gen.py
configpath_llmjudge: opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py
- cmnli:
name: CLUE / CMNLI
category: Reasoning
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py
configpath_llmjudge: ''
- cmo_fib:
name: cmo_fib
category: Examination
paper: ''
configpath: opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py
configpath_llmjudge: ''
- cmrc:
name: CLUE / CMRC
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py
configpath_llmjudge: ''
- commonsenseqa:
name: CommonSenseQA
category: Knowledge
paper: https://arxiv.org/pdf/1811.00937v2
configpath: opencompass/configs/datasets/commonsenseqa/commonsenseqa_gen.py
configpath_llmjudge: ''
- commonsenseqa_cn:
name: CommonSenseQA-CN
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py
configpath_llmjudge: ''
- copa:
name: SuperGLUE / COPA
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py
configpath_llmjudge: ''
- crowspairs:
name: CrowsPairs
category: Safety
paper: https://arxiv.org/pdf/2010.00133
configpath: opencompass/configs/datasets/crowspairs/crowspairs_gen.py
configpath_llmjudge: ''
- crowspairs_cn:
name: CrowsPairs-CN
category: Safety
paper: ''
configpath: opencompass/configs/datasets/crowspairs_cn/crowspairscn_gen.py
configpath_llmjudge: ''
- cvalues:
name: CVALUES
category: Safety
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
configpath: opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py
configpath_llmjudge: ''
- drcd:
name: CLUE / DRCD
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py
configpath_llmjudge: ''
- drop:
name: DROP (DROP Simple Eval)
category: Understanding
paper: https://arxiv.org/pdf/1903.00161
configpath: opencompass/configs/datasets/drop/drop_gen.py
configpath_llmjudge: opencompass/configs/datasets/drop/drop_llm_judge_gen.py
- ds1000:
name: DS-1000
category: Code
paper: https://arxiv.org/pdf/2211.11501
configpath:
- opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py
configpath_llmjudge: ''
- eprstmt:
name: FewCLUE / EPRSTMT
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py
configpath_llmjudge: ''
- flores:
name: Flores
category: Language
paper: https://aclanthology.org/D19-1632.pdf
configpath: opencompass/configs/datasets/flores/flores_gen.py
configpath_llmjudge: ''
- game24:
name: Game24
category: Math
paper: https://huggingface.co/datasets/nlile/24-game
configpath: opencompass/configs/datasets/game24/game24_gen.py
configpath_llmjudge: ''
- govrepcrs:
name: Government Report Dataset
category: Long Context
paper: https://aclanthology.org/2021.naacl-main.112.pdf
configpath: opencompass/configs/datasets/govrepcrs/govrepcrs_gen.py
configpath_llmjudge: ''
- gpqa:
name: GPQA
category: Knowledge
paper: https://arxiv.org/pdf/2311.12022v1
configpath: opencompass/configs/datasets/gpqa/gpqa_gen.py
configpath_llmjudge: opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py
- gsm8k:
name: GSM8K
category: Math
paper: https://arxiv.org/pdf/2110.14168v2
configpath: opencompass/configs/datasets/gsm8k/gsm8k_gen.py
configpath_llmjudge: ''
- gsm_hard:
name: GSM-Hard
category: Math
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
configpath: opencompass/configs/datasets/gsm_hard/gsmhard_gen.py
configpath_llmjudge: ''
- hle:
name: HLE(Humanity's Last Exam)
category: Reasoning
paper: https://lastexam.ai/paper
configpath: opencompass/configs/datasets/HLE/hle_gen.py
configpath_llmjudge: ''
- hellaswag:
name: HellaSwag
category: Reasoning
paper: https://arxiv.org/pdf/1905.07830
configpath: opencompass/configs/datasets/hellaswag/hellaswag_gen.py
configpath_llmjudge: opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py
- humaneval:
name: HumanEval
category: Code
paper: https://arxiv.org/pdf/2107.03374v2
configpath: opencompass/configs/datasets/humaneval/humaneval_gen.py
configpath_llmjudge: ''
- humaneval_cn:
name: HumanEval-CN
category: Code
paper: ''
configpath: opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py
configpath_llmjudge: ''
- humaneval_multi:
name: Multi-HumanEval
category: Code
paper: https://arxiv.org/pdf/2210.14868
configpath: opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen.py
configpath_llmjudge: ''
- humaneval_plus:
name: HumanEval+
category: Code
paper: https://arxiv.org/pdf/2305.01210
configpath: opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py
configpath_llmjudge: ''
- humanevalx:
name: HumanEval-X
category: Code
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
configpath_llmjudge: ''
- hungarian_math:
name: Hungarian_Math
category: Math
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
configpath: opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py
configpath_llmjudge: ''
- iwslt2017:
name: IWSLT2017
category: Language
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
configpath: opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py
configpath_llmjudge: ''
- jigsawmultilingual:
name: JigsawMultilingual
category: Safety
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
configpath: opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py
configpath_llmjudge: ''
- lambada:
name: LAMBADA
category: Understanding
paper: https://arxiv.org/pdf/1606.06031
configpath: opencompass/configs/datasets/lambada/lambada_gen.py
configpath_llmjudge: ''
- lcsts:
name: LCSTS
category: Understanding
paper: https://aclanthology.org/D15-1229.pdf
configpath: opencompass/configs/datasets/lcsts/lcsts_gen.py
configpath_llmjudge: ''
- livestembench:
name: LiveStemBench
category: ''
paper: ''
configpath: opencompass/configs/datasets/livestembench/livestembench_gen.py
configpath_llmjudge: ''
- llm_compression:
name: LLM Compression
category: Bits Per Character (BPC)
paper: https://arxiv.org/pdf/2404.09937
configpath: opencompass/configs/datasets/llm_compression/llm_compression.py
configpath_llmjudge: ''
- math:
name: MATH
category: Math
paper: https://arxiv.org/pdf/2103.03874
configpath: opencompass/configs/datasets/math/math_gen.py
configpath_llmjudge: opencompass/configs/datasets/math/math_llm_judge_gen.py
- math500:
name: MATH500
category: Math
paper: https://github.com/openai/prm800k
configpath: opencompass/configs/datasets/math/math_prm800k_500_gen.py
configpath_llmjudge: opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
- math401:
name: MATH 401
category: Math
paper: https://arxiv.org/pdf/2304.02015
configpath: opencompass/configs/datasets/math401/math401_gen.py
configpath_llmjudge: ''
- mathbench:
name: MathBench
category: Math
paper: https://arxiv.org/pdf/2405.12209
configpath: opencompass/configs/datasets/mathbench/mathbench_gen.py
configpath_llmjudge: ''
- mbpp:
name: MBPP
category: Code
paper: https://arxiv.org/pdf/2108.07732
configpath: opencompass/configs/datasets/mbpp/mbpp_gen.py
configpath_llmjudge: ''
- mbpp_cn:
name: MBPP-CN
category: Code
paper: ''
configpath: opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py
configpath_llmjudge: ''
- mbpp_plus:
name: MBPP-PLUS
category: Code
paper: ''
configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
configpath_llmjudge: ''
- mgsm:
name: MGSM
category: Language / Math
paper: https://arxiv.org/pdf/2210.03057
configpath: opencompass/configs/datasets/mgsm/mgsm_gen.py
configpath_llmjudge: ''
- mmlu:
name: MMLU
category: Understanding
paper: https://arxiv.org/pdf/2009.03300
configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py
configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
- mmlu_cf:
name: MMLU-CF
category: Understanding
paper: https://arxiv.org/pdf/2412.15194
configpath: opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
configpath_llmjudge: ''
- mmlu_pro:
name: MMLU-Pro
category: Understanding
paper: https://arxiv.org/pdf/2406.01574
configpath: opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py
configpath_llmjudge: opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py
- mmmlu:
name: MMMLU
category: Language / Understanding
paper: https://huggingface.co/datasets/openai/MMMLU
configpath:
- opencompass/configs/datasets/mmmlu/mmmlu_gen.py
- opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py
configpath_llmjudge: ''
- multirc:
name: SuperGLUE / MultiRC
category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py
configpath_llmjudge: ''
- multipl_e:
name: MultiPL-E
category: Code
paper: https://arxiv.org/pdf/2210.14868
configpath: opencompass/configs/datasets/multipl_e
configpath_llmjudge: ''
- narrativeqa:
name: NarrativeQA
category: Understanding
paper: https://github.com/google-deepmind/narrativeqa
configpath: opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py
configpath_llmjudge: ''
- natural_question:
name: NaturalQuestions
category: Knowledge
paper: https://github.com/google-research-datasets/natural-questions
configpath: opencompass/configs/datasets/nq/nq_gen.py
configpath_llmjudge: ''
- natural_question_cn:
name: NaturalQuestions-CN
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/nq_cn/nqcn_gen.py
configpath_llmjudge: ''
- obqa:
name: OpenBookQA
category: Knowledge
paper: https://arxiv.org/pdf/1809.02789v1
configpath: opencompass/configs/datasets/obqa/obqa_gen.py
configpath_llmjudge: ''
- olymmath:
name: OlymMATH
category: Math
paper: https://arxiv.org/abs/2503.21380
configpath: ''
configpath_llmjudge: opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py
- piqa:
name: PIQA
category: Knowledge / Physics
paper: https://arxiv.org/pdf/1911.11641v1
configpath: opencompass/configs/datasets/piqa/piqa_gen.py
configpath_llmjudge: ''
- py150:
name: py150
category: Code
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
configpath: opencompass/configs/datasets/py150/py150_gen.py
configpath_llmjudge: ''
- qasper:
name: Qasper
category: Long Context
paper: https://arxiv.org/pdf/2105.03011
configpath: opencompass/configs/datasets/qasper/qasper_gen.py
configpath_llmjudge: ''
- qaspercut:
name: Qasper-Cut
category: Long Context
paper: ''
configpath: opencompass/configs/datasets/qaspercut/qaspercut_gen.py
configpath_llmjudge: ''
- race:
name: RACE
category: Examination
paper: https://arxiv.org/pdf/1704.04683
configpath: opencompass/configs/datasets/race/race_gen.py
configpath_llmjudge: ''
- realtoxicprompts:
name: RealToxicPrompts
category: Safety
paper: https://arxiv.org/pdf/2009.11462
configpath: opencompass/configs/datasets/realtoxicprompts/realtoxicprompts_gen.py
configpath_llmjudge: ''
- record:
name: SuperGLUE / ReCoRD
category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py
configpath_llmjudge: ''
- rte:
name: SuperGLUE / RTE
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen.py
configpath_llmjudge: ''
- ocnli:
name: CLUE / OCNLI
category: Reasoning
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen.py
configpath_llmjudge: ''
- ocnlifc:
name: FewCLUE / OCNLI-FC
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py
configpath_llmjudge: ''
- rolebench:
name: RoleBench
category: Role Play
paper: https://arxiv.org/pdf/2310.00746
configpath: opencompass/configs/datasets/rolebench
configpath_llmjudge: ''
- s3eval:
name: S3Eval
category: Long Context
paper: https://aclanthology.org/2024.naacl-long.69.pdf
configpath: opencompass/configs/datasets/s3eval/s3eval_gen.py
configpath_llmjudge: ''
- scibench:
name: SciBench
category: Reasoning
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
configpath: opencompass/configs/datasets/scibench/scibench_gen.py
configpath_llmjudge: ''
- scicode:
name: SciCode
category: Code
paper: https://arxiv.org/pdf/2407.13168
configpath: opencompass/configs/datasets/scicode/scicode_gen.py
configpath_llmjudge: ''
- simpleqa:
name: SimpleQA
category: Knowledge
paper: https://arxiv.org/pdf/2411.04368
configpath: opencompass/configs/datasets/SimpleQA/simpleqa_gen.py
configpath_llmjudge: ''
- siqa:
name: SocialIQA
category: Reasoning
paper: https://arxiv.org/pdf/1904.09728
configpath: opencompass/configs/datasets/siqa/siqa_gen.py
configpath_llmjudge: ''
- squad20:
name: SQuAD2.0
category: Understanding
paper: https://arxiv.org/pdf/1806.03822
configpath: opencompass/configs/datasets/squad20/squad20_gen.py
configpath_llmjudge: ''
- storycloze:
name: StoryCloze
category: Reasoning
paper: https://aclanthology.org/2022.emnlp-main.616.pdf
configpath: opencompass/configs/datasets/storycloze/storycloze_gen.py
configpath_llmjudge: ''
- strategyqa:
name: StrategyQA
category: Reasoning
paper: https://arxiv.org/pdf/2101.02235
configpath: opencompass/configs/datasets/strategyqa/strategyqa_gen.py
configpath_llmjudge: ''
- summedits:
name: SummEdits
category: Language
paper: https://aclanthology.org/2023.emnlp-main.600.pdf
configpath: opencompass/configs/datasets/summedits/summedits_gen.py
configpath_llmjudge: ''
- summscreen:
name: SummScreen
category: Understanding
paper: https://arxiv.org/pdf/2104.07091v1
configpath: opencompass/configs/datasets/summscreen/summscreen_gen.py
configpath_llmjudge: ''
- svamp:
name: SVAMP
category: Math
paper: https://aclanthology.org/2021.naacl-main.168.pdf
configpath: opencompass/configs/datasets/SVAMP/svamp_gen.py
configpath_llmjudge: ''
- tabmwp:
name: TabMWP
category: Math / Table
paper: https://arxiv.org/pdf/2209.14610
configpath: opencompass/configs/datasets/TabMWP/TabMWP_gen.py
configpath_llmjudge: ''
- taco:
name: TACO
category: Code
paper: https://arxiv.org/pdf/2312.14852
configpath: opencompass/configs/datasets/taco/taco_gen.py
configpath_llmjudge: ''
- tnews:
name: FewCLUE / TNEWS
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py
configpath_llmjudge: ''
- bustm:
name: FewCLUE / BUSTM
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py
configpath_llmjudge: ''
- csl:
name: FewCLUE / CSL
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py
configpath_llmjudge: ''
- ocnli_fc:
name: FewCLUE / OCNLI-FC
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc
configpath_llmjudge: ''
- triviaqa:
name: TriviaQA
category: Knowledge
paper: https://arxiv.org/pdf/1705.03551v2
configpath: opencompass/configs/datasets/triviaqa/triviaqa_gen.py
configpath_llmjudge: ''
- triviaqarc:
name: TriviaQA-RC
category: Knowledge / Understanding
paper: ''
configpath: opencompass/configs/datasets/triviaqarc/triviaqarc_gen.py
configpath_llmjudge: ''
- truthfulqa:
name: TruthfulQA
category: Safety
paper: https://arxiv.org/pdf/2109.07958v2
configpath: opencompass/configs/datasets/truthfulqa/truthfulqa_gen.py
configpath_llmjudge: ''
- tydiqa:
name: TyDi-QA
category: Language
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
configpath: opencompass/configs/datasets/tydiqa/tydiqa_gen.py
configpath_llmjudge: ''
- wic:
name: SuperGLUE / WiC
category: Language
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen.py
configpath_llmjudge: ''
- wsc:
name: SuperGLUE / WSC
category: Language / WSC
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py
configpath_llmjudge: ''
- winogrande:
name: WinoGrande
category: Language / WSC
paper: https://arxiv.org/pdf/1907.10641v2
configpath: opencompass/configs/datasets/winogrande/winogrande_gen.py
configpath_llmjudge: ''
- xcopa:
name: XCOPA
category: Language
paper: https://arxiv.org/pdf/2005.00333
configpath: opencompass/configs/datasets/XCOPA/XCOPA_ppl.py
configpath_llmjudge: ''
- xiezhi:
name: Xiezhi
category: Knowledge
paper: https://arxiv.org/pdf/2306.05783
configpath: opencompass/configs/datasets/xiezhi/xiezhi_gen.py
configpath_llmjudge: ''
- xlsum:
name: XLSum
category: Understanding
paper: https://arxiv.org/pdf/2106.13822v1
configpath: opencompass/configs/datasets/XLSum/XLSum_gen.py
configpath_llmjudge: ''
- xsum:
name: Xsum
category: Understanding
paper: https://arxiv.org/pdf/1808.08745
configpath: opencompass/configs/datasets/Xsum/Xsum_gen.py
configpath_llmjudge: ''
- cola:
name: GLUE / CoLA
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py
configpath_llmjudge: ''
- mprc:
name: GLUE / MPRC
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py
configpath_llmjudge: ''
- qqp:
name: GLUE / QQP
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py
configpath_llmjudge: ''
- omni_math:
name: Omni-MATH
category: Math
paper: https://omni-math.github.io/
configpath: opencompass/configs/datasets/omni_math/omni_math_gen.py
configpath_llmjudge: ''
- wikibench:
name: WikiBench
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/wikibench/wikibench_gen.py
configpath_llmjudge: ''
- supergpqa:
name: SuperGPQA
category: Knowledge
paper: https://arxiv.org/pdf/2502.14739
configpath: opencompass/configs/datasets/supergpqa
configpath_llmjudge: ''

View File

@ -1,10 +1,20 @@
var collapsedSections = [];
var collapsedSections = ['Dataset Statistics'];
$(document).ready(function () {
$('.model-summary').DataTable({
$('.dataset').DataTable({
"stateSave": false,
"lengthChange": false,
"pageLength": 20,
"order": []
"order": [],
"language": {
"info": "Show _START_ to _END_ ItemsTotally _TOTAL_ ",
"infoFiltered": "Filtered from _MAX_ Items",
"search": "Search",
"zeroRecords": "Item Not Found",
"paginate": {
"next": "Next",
"previous": "Previous"
},
}
});
});

View File

@ -0,0 +1,370 @@
# LLM as Judge Evaluation
## Introduction
OpenCompass provides the GenericLLMEvaluator component to facilitate LLM-as-judge evaluations. It is particularly useful for scenarios where rule-based methods (such as regular expressions) cannot reliably judge outputs, for example:
- Cases where models output the answer content without option identifiers
- Factual-judgment datasets that are difficult to evaluate with rules
- Open-ended responses that require complex understanding and reasoning
- Evaluations that would otherwise require designing a large number of rules
## Dataset Format
The dataset for LLM judge evaluation should be in either JSON Lines (.jsonl) or CSV format. Each entry should contain at least:
- A problem or question
- A reference answer or gold standard
- (The model's prediction will be generated during evaluation)
Example JSONL format:
```json
{"problem": "What is the capital of France?", "answer": "Paris"}
```
Example CSV format:
```csv
problem,answer
"What is the capital of France?","Paris"
```
## Configuration
### Using LLM for Evaluation via Command Line
Some datasets in OpenCompass already include LLM judge configurations.
You need to use a model service (such as OpenAI or DeepSeek's official API) or start a model service locally using tools like LMDeploy, vLLM, or SGLang.
Then, you can set the environment variables for the evaluation service and evaluate models using the following commands:
```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```
Note that by default, OpenCompass will use these three environment variables, but if you use configuration files to configure the evaluation service, these environment variables will not take effect.
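For example, a launch command might then look like the following; the model and dataset abbreviations below are placeholders and should be replaced with configs that exist in your OpenCompass installation (only datasets whose configs already include an LLM judge will use the judge service):
```bash
# Hypothetical example: the model/dataset names below are placeholders.
# With the OC_JUDGE_* variables exported, a dataset config that ships an
# LLM judge evaluator will pick up the judge service automatically.
opencompass --models hf_internlm2_5_7b_chat --datasets aime2024_llmjudge_gen
```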
### Using LLM for Evaluation via Configuration Files
To set up an LLM judge evaluation, you'll need to configure three main components:
1. Dataset Reader Configuration
```python
reader_cfg = dict(
input_columns=['problem'], # Column name for the question
output_column='answer' # Column name for the reference answer
)
```
2. Inference Configuration
```python
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}', # Template for prompting the model
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
```
3. Evaluation Configuration with LLM Judge
```python
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator, # Using LLM as evaluator
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), # Template for the judge
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
),
judge_cfg=YOUR_JUDGE_MODEL_CONFIG, # Configuration for the judge model
dict_postprocessor=dict(type=generic_llmjudge_postprocess), # Post-processing the judge's output
),
)
```
## Using CustomDataset with GenericLLMEvaluator
Here's how to set up a complete configuration for LLM judge evaluation:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# Import your judge model configuration
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as judge_model,
)
# Define your judge template
JUDGE_TEMPLATE = """
Please evaluate whether the following response correctly answers the question.
Question: {problem}
Reference Answer: {answer}
Model Response: {prediction}
Is the model response correct? If correct, answer "A"; if incorrect, answer "B".
""".strip()
# Dataset reader configuration
reader_cfg = dict(input_columns=['problem'], output_column='answer')
# Inference configuration for the model being evaluated
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration with LLM judge
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
),
judge_cfg=judge_model[0],
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
# Dataset configuration
datasets = [
dict(
type=CustomDataset,
abbr='my-dataset',
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
# Model configuration for the model being evaluated
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='model-to-evaluate',
path='path/to/your/model',
# ... other model configurations
)
]
# Output directory
work_dir = './outputs/llm_judge_eval'
```
## GenericLLMEvaluator
The GenericLLMEvaluator is designed to use an LLM as a judge for evaluating model outputs. Key features include:
1. Flexible prompt templates for instructing the judge
2. Support for various judge models (local or API-based)
3. Customizable evaluation criteria through prompt engineering
4. Post-processing of judge outputs to extract structured evaluations
**Important Note**: The current generic version of the judge template only supports outputs in the format of "A" (correct) or "B" (incorrect), and does not support other output formats (like "CORRECT" or "INCORRECT"). This is because the post-processing function `generic_llmjudge_postprocess` is specifically designed to parse this format.
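To illustrate why the output format matters, here is a minimal conceptual sketch of the kind of parsing such a post-processor performs; it is not the actual `generic_llmjudge_postprocess` implementation, just the idea behind it.
```python
# Conceptual sketch only (not OpenCompass's actual implementation): pull an
# "A"/"B" verdict out of each judge response and aggregate into accuracy.
import re
from typing import Dict, List


def parse_verdict(judge_response: str) -> bool:
    """Return True if the judge's reply gives an 'A' (correct) verdict."""
    match = re.search(r'\b([AB])\b', judge_response)
    return match is not None and match.group(1) == 'A'


def score(judge_responses: List[str]) -> Dict[str, float]:
    verdicts = [parse_verdict(resp) for resp in judge_responses]
    return {'accuracy': 100.0 * sum(verdicts) / max(len(verdicts), 1)}


print(score(['A', 'B', 'The answer matches, so: A']))  # ~66.7% accuracy
```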
The evaluator works by:
1. Taking the original problem, reference answer, and model prediction
2. Formatting them into a prompt for the judge model
3. Parsing the judge's response to determine the evaluation result (looking for "A" or "B")
4. Aggregating results across the dataset
If you would like to see the full details of evaluation results, you can add `--dump-eval-details` to the command line when you start the job.
Example evaluation output:
```python
{
'accuracy': 75.0, # Percentage of responses judged as correct
'details': [
{
'origin_prompt': """
Please evaluate whether the following response correctly answers the question.
Question: What is the capital of France?
Reference Answer: Paris
Model Response: Paris
Is the model response correct? If correct, answer "A"; if incorrect, answer "B".
""",
'gold': 'Paris',
'prediction': 'A',
},
# ... more results
]
}
```
## CascadeEvaluator
OpenCompass also provides a CascadeEvaluator that combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes:
1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, then only sends samples that were deemed incorrect by the rule-based evaluation to an LLM judge for re-evaluation. This approach reduces reliance on LLM judgments while maintaining accuracy, thus lowering evaluation costs and time.
2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and LLM judge, then considers a sample correct if either method marks it as correct. This approach can increase the leniency of evaluation but may result in higher costs since all samples require LLM evaluation. A minimal per-sample sketch of both decision rules is shown below.
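The following sketch is only illustrative and is not the actual `CascadeEvaluator` implementation; `rule_eval` and `llm_judge` stand for arbitrary per-sample evaluator callables.
```python
# Illustrative sketch only (not the actual CascadeEvaluator): per-sample
# decision rules of the cascade and parallel modes.
from typing import Any, Callable


def evaluate_sample(sample: Any,
                    rule_eval: Callable[[Any], bool],
                    llm_judge: Callable[[Any], bool],
                    parallel: bool = False) -> bool:
    rule_ok = rule_eval(sample)
    if parallel:
        # Parallel mode: every sample also goes to the LLM judge, and the
        # sample counts as correct if either evaluator accepts it.
        llm_ok = llm_judge(sample)
        return rule_ok or llm_ok
    if rule_ok:
        # Cascade mode: a rule-based pass is accepted immediately ...
        return True
    # ... and only rule-based failures are re-checked by the LLM judge.
    return llm_judge(sample)
```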
### Configuring CascadeEvaluator
Here's an example of how to configure the CascadeEvaluator:
```python
# Define a rule-based evaluator
rule_evaluator = dict(type=MATHEvaluator)
# Define an LLM judge evaluator
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=YourDataset,
path='path/to/your/dataset',
reader_cfg=reader_cfg,
),
judge_cfg=dict(), # Can use environment variables to configure the judge model
)
# Configure cascade evaluator (cascade mode)
cascade_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False # Cascade mode
)
# For parallel mode, set parallel=True
parallel_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=True # Parallel mode
)
# Use the cascade evaluator in your dataset evaluation config
eval_cfg = dict(evaluator=cascade_evaluator)
```
### Evaluation Results
The cascade evaluator outputs detailed evaluation statistics including:
- Accuracy of the rule-based evaluation
- Accuracy of the LLM evaluation (for samples that failed rule-based evaluation in cascade mode)
- Final combined accuracy
Example output:
```python
{
'accuracy': 85.0, # Final accuracy
'cascade_stats': {
'total_samples': 100,
'rule_correct': 70, # Number of samples correct by rule evaluation
'rule_accuracy': 70.0, # Accuracy of rule evaluation
'llm_evaluated': 30, # Number of samples evaluated by LLM (failed samples in cascade mode)
'llm_correct': 15, # Number of samples correct by LLM evaluation
'llm_accuracy': 50.0, # Accuracy of LLM evaluation
'final_correct': 85, # Total correct samples
'final_accuracy': 85.0, # Final accuracy
'parallel_mode': False, # Whether parallel mode was used
},
'details': [
# Detailed evaluation results for each sample
]
}
```
The cascade evaluator is particularly useful for:
1. Scenarios that require balancing evaluation cost and accuracy
2. Cases where rule-based evaluators are available but might not be comprehensive
3. Evaluation tasks that need more nuanced judgment for edge cases
## Complete Example
For a complete working example using GenericLLMEvaluator, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving.
For a complete working example using CascadeEvaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which also demonstrates how to evaluate mathematical problem-solving.

View File

@ -0,0 +1,190 @@
# General Math Evaluation Guidance
## Introduction
Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHEvaluator components.
## Dataset Format
The math evaluation dataset should be in either JSON Lines (.jsonl) or CSV format. Each problem should contain at least:
- A problem statement
- A solution/answer (typically in LaTeX format with the final answer in \\boxed{})
Example JSONL format:
```json
{"problem": "Find the value of x if 2x + 3 = 7", "solution": "Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}"}
```
Example CSV format:
```csv
problem,solution
"Find the value of x if 2x + 3 = 7","Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}"
```
## Configuration
To evaluate mathematical reasoning, you'll need to set up three main components:
1. Dataset Reader Configuration
```python
math_reader_cfg = dict(
input_columns=['problem'], # Column name for the question
output_column='solution' # Column name for the answer
)
```
2. Inference Configuration
```python
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
```
3. Evaluation Configuration
```python
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator),
)
```
## Using CustomDataset
Here's how to set up a complete configuration for math evaluation:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
math_datasets = [
dict(
type=CustomDataset,
abbr='my-math-dataset', # Dataset abbreviation
path='path/to/your/dataset', # Path to your dataset file
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
```
## MATHEvaluator
The MATHEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions.
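For context, `math_verify` can also be used on its own. The following is a rough sketch, assuming the `math_verify` package is installed and exposes `parse` and `verify`; the exact API surface may differ across versions.
```python
# Rough sketch of using math_verify directly, outside of OpenCompass.
from math_verify import parse, verify

# Parse the reference answer and a model prediction into symbolic form.
reference = parse(r"$\frac{1}{2}$")
prediction = parse(r"The answer is \boxed{0.5}")

# verify() checks mathematical equivalence of the parsed expressions,
# so 1/2 and 0.5 should be treated as the same answer.
print(verify(reference, prediction))
```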
The MATHEvaluator does the following:
1. Extracts answers from both predictions and references using LaTeX extraction
2. Handles various LaTeX formats and environments
3. Verifies mathematical equivalence between predicted and reference answers
4. Provides detailed evaluation results including:
- Accuracy score
- Detailed comparison between predictions and references
- Parse results of both predicted and reference answers
The evaluator supports:
- Basic arithmetic operations
- Fractions and decimals
- Algebraic expressions
- Trigonometric functions
- Roots and exponents
- Mathematical symbols and operators
Example evaluation output:
```python
{
'accuracy': 85.0, # Percentage of correct answers
'details': [
{
'predictions': 'x = 2', # Parsed prediction
'references': 'x = 2', # Parsed reference
'correct': True # Whether they match
},
# ... more results
]
}
```
## Complete Example
Here's a complete example of how to set up math evaluation:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# Dataset reader configuration
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
# Inference configuration
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator),
)
# Dataset configuration
math_datasets = [
dict(
type=CustomDataset,
abbr='my-math-dataset',
path='path/to/your/dataset.jsonl', # or .csv
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
# Model configuration
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='your-model-name',
path='your/model/path',
# ... other model configurations
)
]
# Output directory
work_dir = './outputs/math_eval'
```

View File

@ -90,4 +90,16 @@ Although OpenCompass has already included most commonly used datasets, users nee
return dataset
```
3. After completing the dataset script and config file, you need to register the information of your new dataset in the file `dataset-index.yml` at the main directory, so that it can be added to the dataset statistics list on the OpenCompass website.
- The keys that need to be filled in include `name`: the name of your dataset, `category`: the category of your dataset, `paper`: the URL of the paper or project, and `configpath`: the path to the dataset config file. Here's an example:
```
- mydataset:
name: MyDataset
category: Understanding
paper: https://arxiv.org/pdf/xxxxxxx
configpath: opencompass/configs/datasets/MyDataset
```
Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial.

View File

@ -0,0 +1,65 @@
# Evaluation Results Persistence
## Introduction
Normally, OpenCompass saves evaluation results to your work directory. In some cases, however, users may want to share results with each other or quickly browse existing public evaluation results. We therefore provide an interface for quickly transferring evaluation results to an external public data station, along with functions for uploading, overwriting, and reading those results.
## Quick Start
### Uploading
By adding an argument to the evaluation command, or a configuration entry in the eval script, the evaluation results can be stored in the path you specify. Here are the two approaches:
(Approach 1) Add the `-sp` option to the command and specify your public path.
```bash
opencompass ... -sp '/your_path'
```
(Approach 2) Add the following configuration to the eval script.
```python
station_path = '/your_path'
```
### Overwriting
Before uploading, the storage method above first checks whether results for the same task already exist in the data station, based on the `abbr` attributes in the model and dataset configurations. If they do, the upload is skipped. If you need to update these results, add the `--station-overwrite` option to the command, for example:
```bash
opencompass ... -sp '/your_path' --station-overwrite
```
### Reading
You can read existing results directly from the data station to avoid duplicate evaluation tasks. The retrieved results participate directly in the `summarize` step. With this option, only tasks whose results are not yet stored in the data station will be launched. Here is an example:
```bash
opencompass ... -sp '/your_path' --read-from-station
```
### Command Combination
1. Upload only the results under your latest work directory to the data station, without re-running tasks whose results are missing:
```bash
opencompass ... -sp '/your_path' -r latest -m viz
```
## Storage Format of the Data Station
In the data station, the evaluation results of each `model-dataset` pair are stored as a `json` file. The directory layout is `/your_path/dataset_name/model_name.json`. Each `json` file stores a dictionary with the corresponding results, including `predictions`, `results`, and `cfg`; here is an example:
```python
Result = {
'predictions': List[Dict],
'results': Dict,
'cfg': Dict = {
'models': Dict,
'datasets': Dict,
(Only subjective datasets)'judge_models': Dict
}
}
```
Among these three keys, `predictions` records the model's prediction for each item in the dataset, `results` records the model's overall score on the dataset, and `cfg` records the detailed configurations of the model and the dataset used in this evaluation task.
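To illustrate the layout, here is a minimal sketch of reading one stored result back from the data station; the paths and names are placeholders.
```python
# Minimal sketch (paths and names are placeholders): load one stored
# model-dataset result from the data station and inspect its contents.
import json
from pathlib import Path

station = Path('/your_path')
result_file = station / 'my_dataset' / 'my_model.json'

with open(result_file) as f:
    result = json.load(f)

print(result['results'])            # aggregate scores on this dataset
print(len(result['predictions']))   # one entry per evaluated sample
print(list(result['cfg']))          # 'models', 'datasets' (plus 'judge_models' for subjective sets)
```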

View File

@ -117,6 +117,10 @@ html_js_files = [
'js/custom.js'
]
html_context = {
'github_version': 'main',
}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
@ -220,3 +224,11 @@ autodoc_typehints = 'none'
# The not found page
notfound_template = '404.html'
def builder_inited_handler(app):
subprocess.run(['./statis.py'])
def setup(app):
app.connect('builder-inited', builder_inited_handler)

View File

@ -39,8 +39,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
user_guides/evaluation.md
user_guides/experimentation.md
user_guides/metrics.md
user_guides/summarizer.md
user_guides/corebench.md
user_guides/deepseek_r1.md
.. _Prompt:
.. toctree::
@ -62,16 +61,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
advanced_guides/custom_dataset.md
advanced_guides/new_model.md
advanced_guides/evaluation_lmdeploy.md
advanced_guides/evaluation_lightllm.md
advanced_guides/accelerator_intro.md
advanced_guides/math_verify.md
advanced_guides/llm_judge.md
advanced_guides/code_eval.md
advanced_guides/code_eval_service.md
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md
advanced_guides/contamination_eval.md
advanced_guides/needleinahaystack_eval.md
advanced_guides/persistence.md
.. _Tools:
.. toctree::
@ -80,6 +76,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
tools.md
.. _Dataset List:
.. toctree::
:maxdepth: 1
:caption: Dataset List
dataset_statistics.md
.. _Notes:
.. toctree::
:maxdepth: 1

103
docs/en/statis.py Executable file
View File

@ -0,0 +1,103 @@
#! /usr/bin/env python
from pathlib import Path
import yaml
from tabulate import tabulate
OC_ROOT = Path(__file__).absolute().parents[2]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
DATASETZOO_TEMPLATE = """\
# Dataset Statistics
On this page, we have listed all the datasets supported by OpenCompass.
You can use sorting and search functions to find the dataset you need.
We provide recommended running configurations for each dataset,
and for some datasets we also offer recommended configurations based on LLM Judge.
You can quickly start evaluation tasks based on the recommended configurations.
However, please note that these configurations may be updated over time.
"""
with open('dataset_statistics.md', 'w') as f:
f.write(DATASETZOO_TEMPLATE)
load_path = str(OC_ROOT / 'dataset-index.yml')
with open(load_path, 'r') as f2:
data_list = yaml.load(f2, Loader=yaml.FullLoader)
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
recommanded_dataset_list = [
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
'mmlu_pro', 'musr', 'math500'
]
def table_format(data_list):
table_format_list = []
for i in data_list:
table_format_list_sub = []
for j in i:
if j in recommanded_dataset_list:
link_token = '[link]('
else:
link_token = '[link(TBD)]('
for index in HEADER:
if index == 'paper':
table_format_list_sub.append('[link](' + i[j][index] + ')')
elif index == 'configpath_llmjudge':
if i[j][index] == '':
table_format_list_sub.append(i[j][index])
else:
table_format_list_sub.append(link_token +
GITHUB_PREFIX +
i[j][index] + ')')
elif index == 'configpath':
if isinstance(i[j][index], list):
sub_list_text = ''
for k in i[j][index]:
sub_list_text += (link_token + GITHUB_PREFIX + k +
') / ')
table_format_list_sub.append(sub_list_text[:-2])
else:
table_format_list_sub.append(link_token +
GITHUB_PREFIX +
i[j][index] + ')')
else:
table_format_list_sub.append(i[j][index])
table_format_list.append(table_format_list_sub)
return table_format_list
data_format_list = table_format(data_list)
def generate_table(data_list, title=None):
with open('dataset_statistics.md', 'a') as f:
if title is not None:
f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""")
header = [
'Name', 'Category', 'Paper or Repository', 'Recommended Config',
'Recommended Config (LLM Judge)'
]
table_cfg = dict(tablefmt='pipe',
floatfmt='.2f',
numalign='right',
stralign='center')
f.write(tabulate(data_list, header, **table_cfg))
f.write('\n```\n')
generate_table(
data_list=data_format_list,
title='## Supported Dataset List',
)

View File

@ -81,3 +81,43 @@ datasets += cmnli_datasets
Users can choose different abilities, different datasets and different evaluation methods configuration files to build the part of the dataset in the evaluation script according to their needs.
For information on how to start an evaluation task and how to evaluate self-built datasets, please refer to the relevant documents.
### Multiple Evaluations on the Dataset
In the dataset configuration, you can set the parameter `n` to perform multiple evaluations on the same dataset and return the average metrics, for example:
```python
afqmc_datasets = [
dict(
abbr="afqmc-dev",
type=AFQMCDatasetV2,
path="./data/CLUE/AFQMC/dev.json",
n=10, # Perform 10 evaluations
reader_cfg=afqmc_reader_cfg,
infer_cfg=afqmc_infer_cfg,
eval_cfg=afqmc_eval_cfg,
),
]
```
Additionally, for binary evaluation metrics (such as accuracy, pass-rate, etc.), you can also set the parameter `k` in conjunction with `n` for [G-Pass@k](http://arxiv.org/abs/2412.13147) evaluation. The formula for G-Pass@k is:
```{math}
\text{G-Pass@}k_\tau=E_{\text{Data}}\left[ \sum_{j=\lceil \tau \cdot k \rceil}^c \frac{{c \choose j} \cdot {n - c \choose k - j}}{{n \choose k}} \right],
```
where $n$ is the number of evaluations, and $c$ is the number of times that passed or were correct out of $n$ runs. An example configuration is as follows:
```python
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
k=[2, 4], # Return results for G-Pass@2 and G-Pass@4
n=12, # 12 evaluations
...
)
]
```
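For reference, the following standalone sketch (independent of OpenCompass's own implementation) computes G-Pass@k for a single problem given `n` runs with `c` correct, following the formula above:
```python
# Standalone sketch (not OpenCompass code): G-Pass@k for one problem with
# n total runs, c correct runs, subset size k and threshold tau.
import math


def g_pass_at_k(n: int, c: int, k: int, tau: float) -> float:
    start = math.ceil(tau * k)
    total = 0.0
    for j in range(start, c + 1):
        if k - j < 0 or k - j > n - c:
            continue  # impossible draw, contributes nothing
        total += math.comb(c, j) * math.comb(n - c, k - j) / math.comb(n, k)
    return total


# e.g. 12 runs, 7 of them correct: probability that at least half of a
# randomly drawn subset of 4 runs is correct.
print(g_pass_at_k(n=12, c=7, k=4, tau=0.5))
```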

View File

@ -0,0 +1,192 @@
# Tutorial for Evaluating Reasoning Models
OpenCompass provides an evaluation tutorial for the DeepSeek R1 series of reasoning models on mathematical datasets.
- At the model level, we recommend sampling-based decoding to reduce the repetition caused by greedy decoding
- For datasets with a limited number of samples, we run the evaluation multiple times and report the average
- For answer validation, we use LLM-based verification to reduce the misjudgments of rule-based evaluation
## Installation and Preparation
Please follow OpenCompass's installation guide.
## Evaluation Configuration Setup
We provide example configurations in `examples/eval_deepseek_r1.py`. Below is the configuration explanation:
### Configuration Interpretation
#### 1. Dataset and Validator Configuration
```python
# Configuration supporting multiple runs (example)
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
# LLM validator configuration. Users need to deploy API services via LMDeploy/vLLM/SGLang or use OpenAI-compatible endpoints
verifier_cfg = dict(
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct', # Replace with actual path
key='YOUR_API_KEY', # Use real API key
openai_api_base=['http://your-api-endpoint'], # Replace with API endpoint
query_per_second=16,
batch_size=1024,
temperature=0.001,
max_out_len=16384
)
# Apply validator to all datasets
for item in datasets:
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
```
#### 2. Model Configuration
We provide an example that uses LMDeploy as the inference backend; users can modify `path` (i.e., the HuggingFace model path) to evaluate other models.
```python
# LMDeploy model configuration example
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768
),
max_seq_len=32768,
batch_size=64,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
# Extendable 14B/32B configurations...
]
```
#### 3. Evaluation Process Configuration
```python
# Inference configuration
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
# Evaluation configuration
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
)
```
#### 4. Summary Configuration
```python
# Multiple runs results average configuration
summary_groups = [
{
'name': 'AIME2024-Average8',
'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
},
# Other dataset average configurations...
]
summarizer = dict(
dataset_abbrs=[
['AIME2024-Average8', 'naive_average'],
# Other dataset metrics...
],
summary_groups=summary_groups
)
# Work directory configuration
work_dir = "outputs/deepseek_r1_reasoning"
```
## Evaluation Execution
### Scenario 1: Model loaded on 1 GPU, data evaluated by 1 worker, using a total of 1 GPU
```bash
opencompass examples/eval_deepseek_r1.py --debug --dump-eval-details
```
Evaluation logs will be output in the command line.
### Scenario 2: Model loaded on 1 GPU, data evaluated by 8 workers, using a total of 8 GPUs
You need to modify the `infer` configuration in the configuration file and set `num_worker` to 8
```python
# Inference configuration
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```
At the same time, remove the `--debug` parameter from the evaluation command
```bash
opencompass examples/eval_deepseek_r1.py --dump-eval-details
```
In this mode, OpenCompass uses multithreading to launch `$num_worker` tasks. Detailed logs are not shown in the command line; instead, per-task evaluation logs are written under `$work_dir`.
### Scenario 3: Model loaded on 2 GPUs, data evaluated by 4 workers, using a total of 8 GPUs
Note that in the model configuration, `num_gpus` in `run_cfg` needs to be set to 2 (if an inference backend is used, its parallelism parameters must be changed accordingly, e.g. `tp=2` for LMDeploy), and `num_worker` in the `infer` configuration needs to be set to 4:
```python
models += [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=128,
run_cfg=dict(num_gpus=2),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
]
```
```python
# Inference configuration
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```
### Evaluation Results
The evaluation results are displayed as follows:
```bash
dataset            version    metric         mode    deepseek-r1-distill-qwen-7b-turbomind
-----------------  ---------  -------------  ------  ---------------------------------------
MATH               -          -              -
AIME2024-Average8  -          naive_average  gen     56.25
```
## Performance Baseline
Since the model uses Sampling for decoding, and the AIME dataset size is small, there may still be a performance fluctuation of 1-3 points even when averaging over 8 evaluations.
| Model | Dataset | Metric | Value |
| ---------------------------- | -------- | -------- | ----- |
| DeepSeek-R1-Distill-Qwen-7B | AIME2024 | Accuracy | 56.3 |
| DeepSeek-R1-Distill-Qwen-14B | AIME2024 | Accuracy | 74.2 |
| DeepSeek-R1-Distill-Qwen-32B | AIME2024 | Accuracy | 74.2 |

View File

@ -57,7 +57,7 @@ The parameter explanation is as follows:
- `-w`: Specify the working path, default is `./outputs/default`.
- `-l`: Enable status reporting via Lark bot.
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
- `--dump-eval-details`: When enabled, evaluation results under the `results` folder will include more details, such as the correctness of each sample.
- `--dump-eval-details`: Enabled by default; evaluation results under the `results` folder will include more details, such as the correctness of each sample. Set `--dump-eval-details False` to disable it.
Using run mode `-m all` as an example, the overall execution flow is as follows:

View File

@ -1,10 +1,20 @@
var collapsedSections = [];
var collapsedSections = ['数据集统计'];
$(document).ready(function () {
$('.model-summary').DataTable({
$('.dataset').DataTable({
"stateSave": false,
"lengthChange": false,
"pageLength": 20,
"order": []
"order": [],
"language": {
"info": "显示 _START_ 至 _END_ 条目(总计 _TOTAL_ ",
"infoFiltered": "(筛选自 _MAX_ 条目)",
"search": "搜索:",
"zeroRecords": "没有找到任何条目",
"paginate": {
"next": "下一页",
"previous": "上一页"
},
}
});
});

View File

@ -0,0 +1,368 @@
# LLM 作为评判器
## 简介
GenericLLMEvaluator组件特别适用于那些难以通过规则式方法如正则表达式进行完美判断的场景例如
- 模型不输出选项标识而只输出选项内容的情况
- 需要事实性判断的数据集
- 需要复杂理解和推理的开放式回答
- 需要设计大量规则的判断
OpenCompass提供了GenericLLMEvaluator组件来实现LLM作为评判器的评估。
## 数据集格式
用于LLM评判的数据集应该是JSON Lines (.jsonl)或CSV格式。每个条目至少应包含
- 问题或任务
- 参考答案或标准答案
- (模型的预测将在评估过程中生成)
JSONL格式示例
```json
{"problem": "法国的首都是什么?", "answer": "巴黎"}
```
CSV格式示例
```csv
problem,answer
"法国的首都是什么?","巴黎"
```
## 配置说明
### 基于命令行使用LLM进行评估
OpenCompass中部分数据集已经包含了LLM评判器的配置。
你需要使用一个模型服务如OpenAI或DeepSeek官方提供的API或本地使用LMDeploy、vLLM、SGLang等工具启动一个模型服务。
然后,你可以通过以下命令设置相关评估服务的环境变量,并对模型进行评估:
```bash
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
export OC_JUDGE_API_KEY=sk-1234
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
```
注意默认情况下OpenCompass会使用这三个环境变量但如果你使用了基于配置文件的方式配置评估服务这三个环境变量将不会生效。
### 基于配置文件使用LLM进行评估
对一个数据集设置LLM评判评估你需要配置三个主要组件
1. 数据集读取配置
```python
reader_cfg = dict(
input_columns=['problem'], # 问题列的名称
output_column='answer' # 参考答案列的名称
)
```
2. 推理配置
```python
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}', # 提示模型的模板
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
```
3. 使用LLM评判器的评估配置
```python
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator, # 使用LLM作为评估器
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="你是一个负责评估模型输出正确性和质量的助手。",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), # 评判器的模板
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
),
judge_cfg=YOUR_JUDGE_MODEL_CONFIG, # 评判模型的配置
dict_postprocessor=dict(type=generic_llmjudge_postprocess), # 处理评判器输出的后处理器
),
)
```
## 使用CustomDataset和GenericLLMEvaluator
以下是如何设置完整的LLM评判评估配置
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# 导入评判模型配置
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as judge_model,
)
# 定义评判模板
JUDGE_TEMPLATE = """
请评估以下回答是否正确地回答了问题。
问题:{problem}
参考答案:{answer}
模型回答:{prediction}
模型回答是否正确?如果正确,请回答"A";如果不正确,请回答"B"。
""".strip()
# 数据集读取配置
reader_cfg = dict(input_columns=['problem'], output_column='answer')
# 被评估模型的推理配置
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# 使用LLM评判器的评估配置
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="你是一个负责评估模型输出正确性和质量的助手。",
)
],
round=[
dict(role='HUMAN', prompt=JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
),
judge_cfg=judge_model[0],
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
# 数据集配置
datasets = [
dict(
type=CustomDataset,
abbr='my-dataset',
path='path/to/your/dataset',
file_name='your_dataset.jsonl',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
# 被评估模型的配置
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='model-to-evaluate',
path='path/to/your/model',
# ... 其他模型配置
)
]
# 输出目录
work_dir = './outputs/llm_judge_eval'
```
## GenericLLMEvaluator
GenericLLMEvaluator专为使用LLM作为评判器评估模型输出而设计。主要特点包括
1. 灵活的提示模板,用于指导评判器
2. 支持各种评判模型本地或基于API
3. 通过提示工程自定义评估标准
4. 对评判器输出进行后处理以提取结构化评估
**重要说明**:目前通用版本的评判模板只支持输出"A"(正确)或"B"(不正确)的格式,不支持其他输出格式(如"正确"或"不正确")。这是因为后处理函数`generic_llmjudge_postprocess`专门设计为解析这种格式。
评估器的工作原理:
1. 获取原始问题、参考答案和模型预测
2. 将它们格式化为评判模型的提示
3. 解析评判器的响应以确定评估结果(寻找"A"或"B"
4. 汇总整个数据集的结果
如果需要查看评估的详细结果,可以在启动任务时添加`--dump-eval-details`到命令行。
评估输出示例:
```python
{
'accuracy': 75.0, # 被判断为正确的回答百分比
'details': [
{
'origin_prompt': """
请评估以下回答是否正确地回答了问题。
问题:法国的首都是什么?
参考答案:巴黎
模型回答:法国的首都是巴黎。
模型回答是否正确?如果正确,请回答"A";如果不正确,请回答"B"。""",
'gold': '巴黎',
'prediction': 'A',
},
# ... 更多结果
]
}
```
## 级联评估器 (CascadeEvaluator)
OpenCompass还提供了级联评估器`CascadeEvaluator`它结合了规则式评估和LLM评估的优势。级联评估器有两种模式
1. **级联模式Cascade Mode, parallel=False**首先使用规则式评估器评估所有样本然后只将规则式评估认为不正确的样本发送给LLM评判器进行重新评估。这种方式可以在保持准确性的同时减少对LLM评判的依赖从而降低评估成本和时间。
2. **并行模式Parallel Mode, parallel=True**使用规则式评估器和LLM评判器同时评估所有样本如果任何一个评估器认为样本是正确的则将该样本视为正确。这种方式可以提高评估的宽容度但可能会导致更高的成本因为所有样本都需要LLM评估。
### 配置CascadeEvaluator
以下是配置`CascadeEvaluator`的示例:
```python
# 定义规则式评估器
rule_evaluator = dict(type=MATHEvaluator)
# 定义LLM评判器
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="你是一个负责评估模型输出正确性和质量的助手。",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=YourDataset,
path='path/to/your/dataset',
reader_cfg=reader_cfg,
),
judge_cfg=dict(), # 可以使用环境变量配置评判模型
)
# 配置级联评估器(级联模式)
cascade_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False # 级联模式
)
# 如果需要并行模式可以设置parallel=True
parallel_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=True # 并行模式
)
# 在数据集评估配置中使用级联评估器
eval_cfg = dict(evaluator=cascade_evaluator)
```
### 评估结果
级联评估器会输出详细的评估统计信息,包括:
- 规则评估的准确率
- LLM评估的准确率针对规则评估失败的样本
- 最终的综合准确率
输出示例:
```python
{
'accuracy': 85.0, # 最终准确率
'cascade_stats': {
'total_samples': 100,
'rule_correct': 70, # 规则评估认为正确的样本数
'rule_accuracy': 70.0, # 规则评估的准确率
'llm_evaluated': 30, # LLM评估的样本数级联模式下为规则评估失败的样本数
'llm_correct': 15, # LLM评估认为正确的样本数
'llm_accuracy': 50.0, # LLM评估的准确率
'final_correct': 85, # 最终正确的样本数
'final_accuracy': 85.0, # 最终准确率
'parallel_mode': False, # 是否是并行模式
},
'details': [
# 每个样本的详细评估结果
]
}
```
级联评估器特别适用于:
1. 需要平衡评估成本和准确性的场景
2. 有可用的规则式评估器但可能不够完善的情况
3. 需要对边界情况进行更精确判断的评估任务
## 完整示例
如果希望了解通用LLM评判器请参考examples目录中的`eval_llm_judge.py`文件该示例展示了如何使用LLM评判器评估数学问题。
如果希望了解级联评估器请参考examples目录中的`eval_cascade_evaluator.py`文件,该示例展示了如何使用级联评估器评估数学问题。

View File

@ -0,0 +1,190 @@
# 数学能力评测
## 简介
数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHEvaluator 组件提供了一种便捷的数学推理评测方式。
## 数据集格式
数学评测数据集应该是 JSON Lines (.jsonl) 或 CSV 格式。每个问题至少应包含:
- 问题陈述
- 解答/答案(通常使用 LaTeX 格式,最终答案需要用 \\boxed{} 括起来)
JSONL 格式示例:
```json
{"problem": "求解方程 2x + 3 = 7", "solution": "让我们逐步解决:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\n因此\\boxed{2}"}
```
CSV 格式示例:
```csv
problem,solution
"求解方程 2x + 3 = 7","让我们逐步解决:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\n因此\\boxed{2}"
```
## 配置说明
要进行数学推理评测,你需要设置三个主要组件:
1. 数据集读取配置
```python
math_reader_cfg = dict(
input_columns=['problem'], # 问题列的名称
output_column='solution' # 答案列的名称
)
```
2. 推理配置
```python
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\n请逐步推理并将最终答案放在 \\boxed{} 中。',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
```
3. 评测配置
```python
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator),
)
```
## 使用 CustomDataset
以下是如何设置完整的数学评测配置:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
math_datasets = [
dict(
type=CustomDataset,
abbr='my-math-dataset', # 数据集简称
path='path/to/your/dataset', # 数据集文件路径
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
```
## MATHEvaluator
MATHEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发,该库提供了数学表达式解析和验证功能,支持 LaTeX 和一般表达式的提取与等价性验证。
MATHEvaluator 具有以下功能:
1. 使用 LaTeX 提取器从预测和参考答案中提取答案
2. 处理各种 LaTeX 格式和环境
3. 验证预测答案和参考答案之间的数学等价性
4. 提供详细的评测结果,包括:
- 准确率分数
- 预测和参考答案的详细比较
- 预测和参考答案的解析结果
评测器支持:
- 基本算术运算
- 分数和小数
- 代数表达式
- 三角函数
- 根式和指数
- 数学符号和运算符
评测输出示例:
```python
{
'accuracy': 85.0, # 正确答案的百分比
'details': [
{
'predictions': 'x = 2', # 解析后的预测答案
'references': 'x = 2', # 解析后的参考答案
'correct': True # 是否匹配
},
# ... 更多结果
]
}
```
## 完整示例
以下是设置数学评测的完整示例:
```python
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.datasets import CustomDataset
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# 数据集读取配置
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
# 推理配置
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\n请逐步推理并将最终答案放在 \\boxed{} 中。',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# 评测配置
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator),
)
# 数据集配置
math_datasets = [
dict(
type=CustomDataset,
abbr='my-math-dataset',
path='path/to/your/dataset.jsonl', # 或 .csv
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
# 模型配置
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='your-model-name',
path='your/model/path',
# ... 其他模型配置
)
]
# 输出目录
work_dir = './outputs/math_eval'
```

View File

@ -91,4 +91,16 @@
return dataset
```
3. 在完成数据集脚本和配置文件的构建后需要在OpenCompass主目录下的`dataset-index.yml`配置文件中登记新数据集的相关信息以使其加入OpenCompass官网Doc的数据集统计列表中。
- 需要填写的字段包括数据集名称`name`、数据集类型`category`、原文或项目地址`paper`、以及数据集配置文件的路径`configpath`。具体示例如下:
```
- mydataset:
name: MyDataset
category: Understanding
paper: https://arxiv.org/pdf/xxxxxxx
configpath: opencompass/configs/datasets/MyDataset
```
详细的数据集配置文件以及其他需要的配置文件可以参考[配置文件](../user_guides/config.md)教程,启动任务相关的教程可以参考[快速开始](../get_started/quick_start.md)教程。

View File

@ -0,0 +1,65 @@
# 评测结果持久化
## 介绍
通常情况下OpenCompass的评测结果将会保存到工作目录下。 但在某些情况下,可能会产生用户间的数据共享,以及快速查看已有的公共评测结果等需求。 因此,我们提供了一个能够将评测结果快速转存到外部公共数据站的接口,并且在此基础上提供了对数据站的上传、更新、读取等功能。
## 快速开始
### 向数据站存储数据
通过在CLI评测指令中添加`args`或在Eval脚本中添加配置即可将本次评测结果存储到您所指定的路径示例如下
方式1在指令中添加`args`选项并指定你的公共路径地址。
```bash
opencompass ... -sp '/your_path'
```
方式2在Eval脚本中添加配置。
```python
station_path = '/your_path'
```
### 向数据站更新数据
上述存储方法在上传数据前会首先根据模型和数据集配置中的`abbr`属性来判断数据站中是否已有相同任务结果。若已有结果,则取消本次存储。如果您需要更新这部分结果,请在指令中添加`station-overwrite`选项,示例如下:
```bash
opencompass ... -sp '/your_path' --station-overwrite
```
### 读取数据站中已有的结果
您可以直接从数据站中读取已有的结果,以避免重复进行评测任务。读取到的结果会直接参与到`summarize`步骤。采用该配置时,仅有数据站中未存储结果的任务会被启动。示例如下:
```bash
opencompass ... -sp '/your_path' --read-from-station
```
### 指令组合
1. 仅向数据站上传最新工作目录下结果,不补充运行缺失结果的任务:
```bash
opencompass ... -sp '/your_path' -r latest -m viz
```
## 数据站存储格式
在数据站中,评测结果按照每个`model-dataset`对的结果存储为`json`文件。具体的目录组织形式为`/your_path/dataset_name/model_name.json`。每个`json`文件都存储了对应结果的字典,包括`predictions`、`results`以及`cfg`三个子项,具体示例如下:
```python
Result = {
'predictions': List[Dict],
'results': Dict,
'cfg': Dict = {
'models': Dict,
'datasets': Dict,
(Only subjective datasets)'judge_models': Dict
}
}
```
其中,`predictions`记录了模型对数据集中每一条数据的prediction的结果`results`记录了模型在该数据集上的评分,`cfg`记录了该评测任务中模型和数据集的详细配置。

View File

@ -117,6 +117,10 @@ html_js_files = [
'js/custom.js'
]
html_context = {
'github_version': 'main',
}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
@ -224,6 +228,7 @@ notfound_template = '404.html'
def builder_inited_handler(app):
subprocess.run(['./cp_origin_docs.sh'])
subprocess.run(['./statis.py'])
def setup(app):

View File

@ -40,8 +40,7 @@ OpenCompass 上手路线
user_guides/evaluation.md
user_guides/experimentation.md
user_guides/metrics.md
user_guides/summarizer.md
user_guides/corebench.md
user_guides/deepseek_r1.md
.. _提示词:
.. toctree::
@ -62,17 +61,13 @@ OpenCompass 上手路线
advanced_guides/custom_dataset.md
advanced_guides/new_model.md
advanced_guides/evaluation_lmdeploy.md
advanced_guides/evaluation_lightllm.md
advanced_guides/accelerator_intro.md
advanced_guides/math_verify.md
advanced_guides/llm_judge.md
advanced_guides/code_eval.md
advanced_guides/code_eval_service.md
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md
advanced_guides/contamination_eval.md
advanced_guides/compassbench_intro.md
advanced_guides/needleinahaystack_eval.md
advanced_guides/persistence.md
.. _工具:
.. toctree::
@ -81,6 +76,13 @@ OpenCompass 上手路线
tools.md
.. _数据集列表:
.. toctree::
:maxdepth: 1
:caption: 数据集列表
dataset_statistics.md
.. _其他说明:
.. toctree::
:maxdepth: 1

98
docs/zh_cn/statis.py Executable file
View File

@ -0,0 +1,98 @@
#! /usr/bin/env python
from pathlib import Path
import yaml
from tabulate import tabulate
OC_ROOT = Path(__file__).absolute().parents[2]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
DATASETZOO_TEMPLATE = """\
# 数据集统计
在本页面中我们列举了OpenCompass所支持的所有数据集
你可以使用排序和搜索功能找到需要的数据集
我们对每一个数据集都给出了推荐的运行配置部分数据集中还提供了基于LLM Judge的推荐配置
你可以基于推荐配置快速启动评测但请注意推荐配置可能随时间推移被更新
"""
with open('dataset_statistics.md', 'w') as f:
f.write(DATASETZOO_TEMPLATE)
load_path = str(OC_ROOT / 'dataset-index.yml')
with open(load_path, 'r') as f2:
data_list = yaml.load(f2, Loader=yaml.FullLoader)
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
recommanded_dataset_list = [
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
'mmlu_pro', 'musr', 'math500'
]
def table_format(data_list):
table_format_list = []
for i in data_list:
table_format_list_sub = []
for j in i:
if j in recommanded_dataset_list:
link_token = '[链接]('
else:
link_token = '[链接(TBD)]('
for index in HEADER:
if index == 'paper':
table_format_list_sub.append('[链接](' + i[j][index] + ')')
elif index == 'configpath_llmjudge':
if i[j][index] == '':
table_format_list_sub.append(i[j][index])
else:
table_format_list_sub.append(link_token +
GITHUB_PREFIX +
i[j][index] + ')')
elif index == 'configpath':
if isinstance(i[j][index], list):
sub_list_text = ''
for k in i[j][index]:
sub_list_text += (link_token + GITHUB_PREFIX + k +
') / ')
table_format_list_sub.append(sub_list_text[:-2])
else:
table_format_list_sub.append(link_token +
GITHUB_PREFIX +
i[j][index] + ')')
else:
table_format_list_sub.append(i[j][index])
table_format_list.append(table_format_list_sub)
return table_format_list
data_format_list = table_format(data_list)
def generate_table(data_list, title=None):
with open('dataset_statistics.md', 'a') as f:
if title is not None:
f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""")
header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置', '推荐配置(基于LLM评估)']
table_cfg = dict(tablefmt='pipe',
floatfmt='.2f',
numalign='right',
stralign='center')
f.write(tabulate(data_list, header, **table_cfg))
f.write('\n```\n')
generate_table(
data_list=data_format_list,
title='## 支持数据集列表',
)

View File

@ -81,3 +81,42 @@ datasets += cmnli_datasets
用户可以根据需要,选择不同能力不同数据集以及不同评测方式的配置文件来构建评测脚本中数据集的部分。
有关如何启动评测任务,以及如何评测自建数据集可以参考相关文档。
### 数据集多次评测
在数据集配置中可以通过设置参数`n`来对同一数据集进行多次评测,最终返回平均指标,例如:
```python
afqmc_datasets = [
dict(
abbr="afqmc-dev",
type=AFQMCDatasetV2,
path="./data/CLUE/AFQMC/dev.json",
n=10, # 进行10次评测
reader_cfg=afqmc_reader_cfg,
infer_cfg=afqmc_infer_cfg,
eval_cfg=afqmc_eval_cfg,
),
]
```
另外对于二值评测指标例如accuracypass-rate等还可以通过设置参数`k`配合`n`进行[G-Pass@k](http://arxiv.org/abs/2412.13147)评测。G-Pass@k计算公式为
```{math}
\text{G-Pass@}k_\tau=E_{\text{Data}}\left[ \sum_{j=\lceil \tau \cdot k \rceil}^c \frac{{c \choose j} \cdot {n - c \choose k - j}}{{n \choose k}} \right],
```
其中 $n$ 为评测次数, $c$ 为 $n$ 次运行中通过或正确的次数。配置例子如下:
```python
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
k=[2, 4], # 返回 G-Pass@2和G-Pass@4的结果
n=12, # 12次评测
...
)
]
```

View File

@ -0,0 +1,192 @@
# 强推理模型评测教程
OpenCompass提供针对DeepSeek R1系列推理模型的评测教程数学数据集
- 在模型层面我们建议使用Sampling方式以减少因为Greedy评测带来的大量重复
- 在数据集层面,我们对数据量较小的评测基准,使用多次评测并取平均的方式。
- 在答案验证层面为了减少基于规则评测带来的误判我们统一使用基于LLM验证的方式进行评测。
## 安装和准备
请按OpenCompass安装教程进行安装。
## 构建评测配置
我们在 `examples/eval_deepseek_r1.py` 中提供了示例配置,以下对评测配置进行解读
### 评测配置解读
#### 1. 数据集与验证器配置
```python
# 支持多运行次数的数据集配置(示例)
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
# 设置LLM验证器 用户需事先通过LMDeploy/vLLM/SGLang等工具启动API 评测服务器或者直接使用兼容OpenAI标准接口的模型服务
verifier_cfg = dict(
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct', # 需替换实际路径
key='YOUR_API_KEY', # 需替换真实API Key
openai_api_base=['http://your-api-endpoint'], # 需替换API地址
query_per_second=16,
batch_size=1024,
temperature=0.001,
max_out_len=16384
)
# 应用验证器到所有数据集
for item in datasets:
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
```
#### 2. 模型配置
我们提供了基于LMDeploy作为推理后端的评测示例用户可以通过修改path即HF路径
```python
# LMDeploy模型配置示例
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768
),
max_seq_len=32768,
batch_size=64,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
# 可扩展14B/32B配置...
]
```
#### 3. 评估流程配置
```python
# 推理配置
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
# 评估配置
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
)
```
#### 4. 结果汇总配置
```python
# 多运行结果平均配置
summary_groups = [
{
'name': 'AIME2024-Average8',
'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
},
# 其他数据集平均配置...
]
summarizer = dict(
dataset_abbrs=[
['AIME2024-Average8', 'naive_average'],
# 其他数据集指标...
],
summary_groups=summary_groups
)
# 工作目录设置
work_dir = "outputs/deepseek_r1_reasoning"
```
## 执行评测
### 场景1模型1卡加载数据1个worker评测共使用1个GPU
```bash
opencompass examples/eval_deepseek_r1.py --debug --dump-eval-details
```
评测日志会在命令行输出。
### 场景2模型1卡加载数据8个worker评测共使用8个GPU
需要修改配置文件中的infer配置将num_worker设置为8
```python
# 推理配置
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```
同时评测命令去掉`--debug`参数
```bash
opencompass examples/eval_deepseek_r1.py --dump-eval-details
```
此模式下OpenCompass将使用多线程启动`$num_worker`个任务,命令行不展示具体日志,具体的评测日志将会在`$work_dir`下中展示。
### 场景3模型2卡加载数据4个worker评测共使用8个GPU
需要注意模型配置中,`run_cfg`中的`num_gpus`需要设置为2(如使用推理后端则推理后端的参数也需要同步修改比如LMDeploy中的tp需要设置为2),同时修改`infer`配置中的`num_worker`为4
```python
models += [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=128,
run_cfg=dict(num_gpus=2),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
]
```
```python
# 推理配置
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```
### 评测结果
评测结果展示如下:
```bash
dataset            version    metric         mode    deepseek-r1-distill-qwen-7b-turbomind
-----------------  ---------  -------------  ------  ---------------------------------------
MATH               -          -              -
AIME2024-Average8  -          naive_average  gen     56.25
```
## 性能基线参考
由于模型使用Sampling进行解码同时AIME数据量较小使用8次评测取平均情况下仍会出现1-3分的性能抖动
| 模型 | 数据集 | 指标 | 数值 |
| ---------------------------- | -------- | -------- | ---- |
| DeepSeek-R1-Distill-Qwen-7B | AIME2024 | Accuracy | 56.3 |
| DeepSeek-R1-Distill-Qwen-14B | AIME2024 | Accuracy | 74.2 |
| DeepSeek-R1-Distill-Qwen-32B | AIME2024 | Accuracy | 74.2 |

View File

@ -57,7 +57,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
- `-w`: 指定工作路径,默认为 `./outputs/default`
- `-l`: 打开飞书机器人状态上报。
- `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
- `--dump-eval-details`: 开启`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。
- `--dump-eval-details`: 默认开启,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。如不需要开启,需设置`--dump-eval-details False`。
以运行模式 `-m all` 为例,整体运行流如下:

View File

@ -0,0 +1,137 @@
# flake8: noqa
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
# Knowledge
# Math
from opencompass.configs.datasets.aime2024.aime2024_0shot_nocot_genericllmeval_academic_gen import \
aime2024_datasets
from opencompass.configs.datasets.bbh.bbh_0shot_nocot_academic_gen import \
bbh_datasets
# General Reasoning
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
humaneval_datasets
# Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
ifeval_datasets
from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \
LCBCodeGeneration_dataset
from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \
math_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets
# Model List
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
# Only take LCB generation for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
[]) + [LCBCodeGeneration_dataset]
# LLM judge config: using LLM to evaluate predictions
judge_cfg = dict()
for dataset in datasets:
dataset['infer_cfg']['inferencer']['max_out_len'] = 32768
if 'judge_cfg' in dataset['eval_cfg']['evaluator']:
dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
core_summary_groups = [
{
'name':
'core_average',
'subsets': [
['IFEval', 'Prompt-level-strict-accuracy'],
['bbh', 'naive_average'],
['math_prm800k_500', 'accuracy'],
['aime2024', 'accuracy'],
['GPQA_diamond', 'accuracy'],
['mmlu_pro', 'naive_average'],
['openai_humaneval', 'humaneval_pass@1'],
['lcb_code_generation', 'pass@1'],
],
},
]
summarizer = dict(
dataset_abbrs=[
['core_average', 'naive_average'],
'',
'Instruction Following',
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'General Reasoning',
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
'',
'Math Calculation',
['math_prm800k_500', 'accuracy'],
['aime2024', 'accuracy'],
'',
'Knowledge',
['mmlu_pro', 'naive_average'],
'',
'Code',
['openai_humaneval', 'humaneval_pass@1'],
['lcb_code_generation', 'pass@1'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask),
),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
#                     PART 5  Utils Configuration                     #
#######################################################################
work_dir = './outputs/oc_academic_202502'

View File

@ -0,0 +1,127 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_evaluator import MATHEvaluator
from opencompass.datasets import (
MATHDataset,
math_postprocess_v2,
normalize_final_answer,
)
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
    # Model to be evaluated
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
reader_cfg = dict(input_columns=['problem'], output_column='solution')
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
########################## Evaluator #################################
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
),
judge_cfg=dict(),
)
rule_evaluator = dict(type=MATHEvaluator)
cascade_evaluator = dict(
    type=CascadeEvaluator,
    llm_evaluator=llm_judge_evaluator,
    rule_evaluator=rule_evaluator,
    parallel=False,
)
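# Cascade behaviour (an assumption about CascadeEvaluator, not verified here):
# with parallel=False the rule-based MATHEvaluator runs first and only samples
# it marks incorrect are re-judged by the LLM evaluator; with parallel=True
# both evaluators are expected to score every sample.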
##########################  Dataset  #################################
eval_cfg = dict()
# eval_cfg['evaluator'] = rule_evaluator
# eval_cfg['evaluator'] = llm_judge_evaluator
eval_cfg['evaluator'] = cascade_evaluator
math_datasets = [
dict(
abbr='math_prm800k_500',
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
datasets = math_datasets
models = lmdeploy_qwen2_5_7b_instruct_model
work_dir = 'math_prm800k_500_cascade_evaluator'

View File

@ -0,0 +1,212 @@
# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard
import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner
from opencompass.models import (
TurboMindModelwithChatTemplate,
)
#######################################################################
# PART 1 Datasets List #
#######################################################################
with read_base():
# You can comment out the datasets you don't want to evaluate
# Datasets
# from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
# from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
# from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
# from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets
# Summarizer
from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups
datasets = sum(
(v for k, v in locals().items() if k.endswith('_datasets')),
[],
)
# Set LLM Verifier used for each dataset
verifier_cfg = dict(
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path
key='sk-1234', # You need to set your own API key
openai_api_base=[
'http://172.30.56.1:4000/v1', # You need to set your own API base
],
meta_template=dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
),
query_per_second=16,
batch_size=1024,
temperature=0.001,
tokenizer_path='gpt-4o-2024-05-13',
verbose=True,
max_out_len=16384,
# max_seq_len=32768,
max_seq_len=49152,
)
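# Note: any OpenAI-compatible endpoint can serve as the verifier. max_seq_len
# has to cover the grading prompt plus the (often very long) reasoning output
# being judged, which is presumably why 49152 is used here rather than 32768.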
for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768  # Uncomment to raise the output limit and avoid length cutoff
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
#######################################################################
# PART 2 Model List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
models += [
# You can comment out the models you don't want to evaluate
# All models use sampling mode
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
gen_config=dict(
do_sample=True,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=64,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
),
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='deepseek-r1-distill-qwen-14b-turbomind',
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
# engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
# gen_config=dict(
# do_sample=True,
# temperature=0.6,
# top_p=0.95,
# max_new_tokens=32768),
# max_seq_len=32768,
# max_out_len=32768,
# batch_size=128,
# run_cfg=dict(num_gpus=2),
# pred_postprocessor=dict(type=extract_non_reasoning_content)
# ),
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='deepseek-r1-distill-qwen-32b-turbomind',
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
# engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
# gen_config=dict(
# do_sample=True,
# temperature=0.6,
# top_p=0.95,
# max_new_tokens=16384),
# max_seq_len=32768,
# max_out_len=16384,
# batch_size=128,
# run_cfg=dict(num_gpus=4),
# pred_postprocessor=dict(type=extract_non_reasoning_content)
# ),
]
#######################################################################
# PART 3 Inference/Evaluation #
#######################################################################
# Inference configuration
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
        num_worker=1
        # Similar to data parallelism: how many workers run inference in parallel,
        # each handling a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker
        # For example, with 8 GPUs and a 7B model using 1 GPU per instance, set num_worker=8
        # to fully utilize the GPUs.
        # With 8 GPUs and a 14B model using 2 GPUs per instance, set num_worker=4
),
runner=dict(
type=LocalRunner,
task=dict(type=OpenICLInferTask)
),
)
# Evaluation configuration
eval = dict(
partitioner=dict(
type=NaivePartitioner, n=8
),
runner=dict(
type=LocalRunner,
task=dict(
type=OpenICLEvalTask)
),
)
#######################################################################
# PART 4 Summarizer #
#######################################################################
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [[
            f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ]
    }
])
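# The 'aime2024-run{idx}' abbrs come from the repeat-8 AIME2024 config imported
# above; averaging accuracy over the 8 sampled runs gives a more stable score
# than a single run.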
# Summarizer
summarizer = dict(
dataset_abbrs=[
'MATH',
# ['LiveMathBench-k1-n1', 'pass@1'],
# ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
# ['aime2024', 'accuracy'],
['math_prm800k_500-llmjudge', 'accuracy'],
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
['OlympiadBenchMath', 'accuracy'],
['OmniMath', 'accuracy'],
],
summary_groups=summary_groups,
)
#######################################################################
# PART 5 Utils #
#######################################################################
work_dir = 'outputs/deepseek_r1_reasoning'

View File

@ -1,7 +1,7 @@
from mmengine.config import read_base
with read_base():
from .datasets.dingo.dingo_gen import datasets
from .models.hf_internlm.hf_internlm_7b import models
from opencompass.configs.datasets.dingo.dingo_gen import datasets
from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
work_dir = './outputs/eval_dingo'

examples/eval_llm_judge.py
View File

@ -0,0 +1,116 @@
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
# Import pre-configured models from OpenCompass
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct_model,
)
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import CustomDataset
# Dataset reader configuration
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
# Inference configuration
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nRemember to put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration using LLM as judge
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='opencompass/math',
file_name='test_prm800k_500.jsonl',
reader_cfg=math_reader_cfg,
),
judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0],
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
# Dataset configuration
datasets = [
dict(
type=CustomDataset,
path='opencompass/math',
file_name='test_prm800k_500.jsonl',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]
# Model to be evaluated
models = lmdeploy_qwen2_5_7b_instruct_model
# Limiting test to first 8 examples for quick testing
math_reader_cfg['test_range'] = '[0:8]'
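# Remove this line (or widen the slice, e.g. '[0:500]') to evaluate the full
# 500-problem test_prm800k_500 split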
# Output directory
work_dir = 'outputs/llm_judge'

View File

@ -0,0 +1,77 @@
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
from opencompass.configs.datasets.math.math_500_gen import math_datasets
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-llama-8b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(
temperature=0.6,
top_p=0.95,
max_new_tokens=32768,
do_sample=True,
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
gen_config=dict(
top_k=1,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768,
do_sample=True,
),
max_seq_len=32768,
max_out_len=32768,
batch_size=16,
run_cfg=dict(num_gpus=2),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
]
datasets = [*math_datasets]
work_dir = './outputs/math_500'

View File

@ -36,8 +36,8 @@ infer = dict(
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=[gpt_4o_2024_05_13_model],
judge_models=[gpt_4o_2024_05_13_model],
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=256,

View File

@ -1 +1 @@
__version__ = '0.4.0'
__version__ = '0.4.2'

View File

@ -12,7 +12,8 @@ from mmengine.config import Config, DictAction
from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
from opencompass.runners import SlurmRunner
from opencompass.summarizers import DefaultSummarizer
from opencompass.utils import LarkReporter, get_logger
from opencompass.utils import (LarkReporter, get_logger, read_from_station,
save_to_station)
from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg,
get_config_from_arg)
@ -118,8 +119,11 @@ def parse_args():
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc.',
action='store_true',
'correctness of each sample, bpb, etc. Defaults to True.',
nargs='?',
const=True,
default=True,
type=lambda x: False if x and x.lower() == 'false' else True
)
parser.add_argument(
'--dump-extract-rate',
@ -127,6 +131,27 @@ def parse_args():
'correctness of each sample, bpb, etc.',
action='store_true',
)
parser.add_argument('-sp',
'--station-path',
help='Path to your results station.',
type=str,
default=None,
)
parser.add_argument('--station-overwrite',
help='Whether to overwrite the results at station.',
action='store_true',
)
parser.add_argument(
'--read-from-station',
        help='Whether to read existing evaluation results from the '
        'data station.',
action='store_true',
)
# set srun args
slurm_parser = parser.add_argument_group('slurm_args')
parse_slurm_args(slurm_parser)
@ -177,8 +202,6 @@ def parse_dlc_args(dlc_parser):
type=str)
def parse_hf_args(hf_parser):
"""These args are all for the quick construction of HuggingFace models."""
hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat')
@ -213,7 +236,6 @@ def parse_custom_dataset_args(custom_dataset_parser):
def main():
args = parse_args()
if args.num_gpus is not None:
raise ValueError('The `--num-gpus` argument is deprecated, please use '
'`--hf-num-gpus` to describe number of gpus used for '
@ -243,9 +265,11 @@ def main():
else:
dir_time_str = args.reuse
logger.info(f'Reusing experiements from {dir_time_str}')
elif args.mode in ['eval', 'viz']:
raise ValueError('You must specify -r or --reuse when running in eval '
'or viz mode!')
elif args.mode in ['eval', 'viz'] and not args.read_from_station:
raise ValueError(
'You must specify -r or --reuse, or you have to specify '
'--read-from-station and --station-path when running in eval '
'or viz mode!')
# update "actual" work_dir
cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
@ -262,6 +286,12 @@ def main():
# types cannot be serialized
cfg = Config.fromfile(output_config_path, format_python_code=False)
# get existed results from station
if args.read_from_station:
existing_results_list = read_from_station(cfg, args)
rs_exist_results = [comb['combination'] for comb in existing_results_list]
cfg['rs_exist_results'] = rs_exist_results
# report to lark bot if specify --lark
if not args.lark:
cfg['lark_bot_url'] = None
@ -269,6 +299,7 @@ def main():
content = f'{getpass.getuser()}\'s task has been launched!'
LarkReporter(cfg['lark_bot_url']).post(content)
# infer
if args.mode in ['all', 'infer']:
# When user have specified --slurm or --dlc, or have not set
# "infer" in config, we will provide a default configuration
@ -321,6 +352,9 @@ def main():
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
        logger.warning('Default to dump eval details, it might take extra '
                       'space to save all the evaluation details. '
                       'Set --dump-eval-details False to skip the details dump')
cfg.eval.runner.task.dump_details = True
if args.dump_extract_rate:
cfg.eval.runner.task.cal_extract_rate = True
@ -350,6 +384,10 @@ def main():
else:
runner(tasks)
# save to station
if args.station_path is not None or cfg.get('station_path') is not None:
save_to_station(cfg, args)
# visualize
if args.mode in ['all', 'eval', 'viz']:
summarizer_cfg = cfg.get('summarizer', {})

View File

@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.arc_prize_public_evaluation import ARCPrizeDataset, ARCPrizeEvaluator
# The system_prompt defines the initial instructions for the model,
# setting the context for solving ARC tasks.
system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.'''
# User message template is a template for creating user prompts. It includes placeholders for training data and test input data,
# guiding the model to learn the rule and apply it to solve the given puzzle.
user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
----------------------------------------
{training_data}
----------------------------------------
Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data:
----------------------------------------
[{{'input': {input_test_data}, 'output': [[]]}}]
----------------------------------------
What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:'''
arc_prize_public_evaluation_reader_cfg = dict(
input_columns=['training_data', 'input_test_data'],
output_column='output_test_data'
)
arc_prize_public_evaluation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='SYSTEM',fallback_role='HUMAN', prompt=system_prompt),
dict(role='HUMAN', prompt=user_message_template),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
arc_prize_public_evaluation_eval_cfg = dict(
evaluator=dict(type=ARCPrizeEvaluator)
)
arc_prize_public_evaluation_datasets = [
dict(
abbr='ARC_Prize_Public_Evaluation',
type=ARCPrizeDataset,
path='opencompass/arc_prize_public_evaluation',
reader_cfg=arc_prize_public_evaluation_reader_cfg,
infer_cfg=arc_prize_public_evaluation_infer_cfg,
eval_cfg=arc_prize_public_evaluation_eval_cfg
)
]

View File

@ -0,0 +1,45 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaokaoBenchDataset
from mmengine.config import read_base
with read_base():
from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
GaokaoBench_datasets = []
for folder, prompts in [
('Multiple-choice_Questions', MCQ_prompts),
('Fill-in-the-blank_Questions', FBQ_prompts),
]:
for p in prompts:
reader_cfg = {
'input_columns': ['question'],
'output_column': 'answer',
}
infer_cfg = {
'ice_template': {
'type': PromptTemplate,
'template': {'round': [{'role': 'HUMAN', 'prompt': p['prefix_prompt'] + '{question}'}]},
'ice_token': '</E>',
},
'retriever': {'type': ZeroRetriever},
'inferencer': {'type': GenInferencer},
}
eval_cfg = {
'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
'pred_role': 'BOT',
}
_base_path = 'opencompass/GAOKAO-BENCH'
dataset = {
'type': GaokaoBenchDataset,
'abbr': 'GaokaoBench_' + p['keyword'],
'path': _base_path,
'filename': '/' + folder + '/' + p['keyword'] + '.json',
'name': p['keyword'],
'reader_cfg': reader_cfg,
'infer_cfg': infer_cfg,
'eval_cfg': eval_cfg,
}
GaokaoBench_datasets.append(dataset)

View File

@ -0,0 +1,5 @@
from mmengine.config import read_base
with read_base():
# Default use LLM as a judge
from .hle_llmverify_gen_6ff468 import hle_datasets # noqa: F401, F403

View File

@ -0,0 +1,91 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import HLEDataset
# ----------------------------- Detailed Config -----------------------------
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=HLEDataset,
path='cais/hle',
reader_cfg=math_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
hle_datasets = [
dict(
type=HLEDataset,
abbr='hle_llmjudge',
path='cais/hle',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .IFEval_gen_3321a3 import ifeval_datasets # noqa: F401, F403
from .IFEval_gen_353ae7 import ifeval_datasets # noqa: F401, F403

View File

@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=math_postprocess_v2)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@ -0,0 +1,57 @@
from opencompass.datasets import MedXpertQADataset, MedXpertQAEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'medical_task',
'body_system',
'question_type',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=MedXpertQAEvaluator),
pred_role='BOT',
)
medxpertqa_dataset = dict(
type=MedXpertQADataset,
abbr='medxpertqa',
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
medxpertqa_datasets = [medxpertqa_dataset]

View File

@ -0,0 +1,104 @@
from opencompass.datasets import MedXpertQADataset, MedXpertQA_llmjudge_postprocess
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: Q: {question}\nA: Among {start} through {end}, the answer is\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'medical_task',
'body_system',
'question_type',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MedXpertQADataset,
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=MedXpertQA_llmjudge_postprocess),
),
)
medxpertqa_dataset = dict(
type=MedXpertQADataset,
abbr='medxpertqa',
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
medxpertqa_datasets = [medxpertqa_dataset]

View File

@ -0,0 +1,60 @@
# OlymMATH
[GitHub Link](https://github.com/RUCAIBox/OlymMATH)
For details on the OlymMATH dataset, please refer to the paper:
Challenging the Boundaries of Reasoning: An Olympiad-Level Math Benchmark for Large Language Models, by Haoxiang Sun, Yingqian Min, Zhipeng Chen, Wayne Xin Zhao, Zheng Liu, Zhongyuan Wang, Lei Fang, and Ji-Rong Wen.
## How to evaluate OlymMATH with an LLM judge
This is a simple example:
```python
from opencompass.models import OpenAISDK, OpenAI
from mmengine.config import read_base
with read_base():
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as qwen2_5_7b_instruct_model
from opencompass.configs.datasets.OlymMATH.olymmath_gen import olymmath_datasets
################## Judge Config ##################
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
judge_cfg = dict(
# An API model with OpenAI API format is required for Judge
abbr='qwen2-5-32B-Instruct',
type=OpenAISDK,
path='Qwen/Qwen2.5-32B-Instruct',
key='sk-1234',
openai_api_base=[
'http://172.30.56.1:4000/v1',
],
meta_template=api_meta_template,
query_per_second=16,
batch_size=1024,
temperature=0.001,
max_completion_tokens=32768,
tokenizer_path='gpt-4o-2024-05-13',
verbose=True,
max_out_len=16384,
max_seq_len=32768,
)
################## Model Config ##################
models = [*qwen2_5_7b_instruct_model]
################## Dataset Config ##################
datasets = [*olymmath_datasets]
# Set judge_cfg for evaluation
for item in datasets:
item['infer_cfg']['inferencer']['max_out_len'] = 32768
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
work_dir = './outputs/olymmath_llm_eval'
```
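The imported `olymmath_datasets` typically contains one dataset per subset (`en-hard`, `zh-hard`, `en-easy`, `zh-easy`). If you only want the hard splits, a minimal sketch is to filter by dataset `abbr` before launching; the exact abbr suffixes are an assumption based on the subset configs in this repo:
```python
# Keep only the hard subsets; abbr names are assumed to end with the subset id,
# e.g. 'olymmath_llmjudge_en-hard'.
datasets = [d for d in datasets if d['abbr'].endswith(('en-hard', 'zh-hard'))]
```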

View File

@ -0,0 +1,5 @@
from mmengine.config import read_base
with read_base():
# Default use LLM as a judge
from .olymmath_llmverify_gen_97b203 import olymmath_datasets # noqa: F401, F403

View File

@ -0,0 +1,99 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import OlymMATHDataset
# ----------------------------- Detailed Config -----------------------------
math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy']
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration
olymmath_datasets = []
for sub_set in sub_sets:
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=OlymMATHDataset,
path='RUC-AIBOX/OlymMATH',
reader_cfg=math_reader_cfg,
subset=sub_set,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
olymmath_datasets.append(
dict(
type=OlymMATHDataset,
abbr=f'olymmath_llmjudge_{sub_set}',
path='RUC-AIBOX/OlymMATH',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
subset=sub_set,
)
)

View File

@ -0,0 +1,105 @@
from mmengine.config import read_base
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
with read_base():
from .OlympiadBench_categories import math_categories as categories
# Create prompter instance for problems
olympiadbench_prompter_cfg = dict(
type='OlympiadBenchPrompter'
)
olympiadbench_reader_cfg = dict(
input_columns=[
'problem', 'language', 'subject', 'question_type',
'answer_type', 'is_multiple_answer', 'unit', 'questions'
],
output_column='solution'
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
olympiadbenchMath_datasets = []
for _name in categories:
olympiadbench_infer_cfg = dict(
prompt_template=dict(
type='OlympiadBenchTemplate'
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
olympiadbench_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=OlympiadBenchDataset,
path='opencompass/OlympiadBench',
name=_name,
reader_cfg=olympiadbench_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
olympiadbenchMath_datasets.append(
dict(
type=OlympiadBenchDataset,
abbr=f'OlympiadBench_{_name}',
path='opencompass/OlympiadBench',
name=_name,
reader_cfg=olympiadbench_reader_cfg,
infer_cfg=olympiadbench_infer_cfg,
eval_cfg=olympiadbench_eval_cfg,
)
)
del _name

View File

@ -0,0 +1,109 @@
from mmengine.config import read_base
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
with read_base():
from .OlympiadBench_categories import categories
# Create prompter instance for problems
olympiadbench_prompter_cfg = dict(
type='OlympiadBenchPrompter'
)
olympiadbench_reader_cfg = dict(
input_columns=[
'problem', 'language', 'subject', 'question_type',
'answer_type', 'is_multiple_answer', 'unit', 'questions'
],
output_column='solution'
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
olympiadbench_datasets = []
for _name in categories:
olympiadbench_infer_cfg = dict(
prompt_template=dict(
type='OlympiadBenchTemplate'
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# olympiadbench_eval_cfg = dict(
# evaluator=dict(type=OlympiadBenchEvaluator, version='v2'),
# pred_postprocessor=dict(type=olympiadbench_postprocess_v2),
# )
# Evaluation configuration
olympiadbench_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=OlympiadBenchDataset,
path='opencompass/OlympiadBench',
name=_name,
reader_cfg=olympiadbench_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
olympiadbench_datasets.append(
dict(
type=OlympiadBenchDataset,
abbr=f'OlympiadBench_{_name}',
path='opencompass/OlympiadBench',
name=_name,
reader_cfg=olympiadbench_reader_cfg,
infer_cfg=olympiadbench_infer_cfg,
eval_cfg=olympiadbench_eval_cfg,
)
)
del _name

View File

@ -5,3 +5,14 @@ categories = [
'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE
]
math_categories = [
'OE_TO_maths_en_COMP', # OpenEnded - TextOnly - maths - COMP
'OE_TO_maths_zh_COMP', # OpenEnded - TextOnly - maths - COMP
'OE_TO_maths_zh_CEE', # OpenEnded - TextOnly - maths - CEE
]
physics_categories = [
'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE
]

View File

@ -0,0 +1,98 @@
# flake8: noqa
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN',
prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE),
]),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess,
metric_name='accuracy'),
),
pred_role='BOT',
)
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
mode='singlescore',
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .aime2024_gen_6e39a4 import aime2024_datasets # noqa: F401, F403
from .aime2024_gen_17d799 import aime2024_datasets # noqa: F401, F403

View File

@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import MATHEvaluator
from opencompass.datasets import Aime2024Dataset
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
aime2024_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator)
)
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .aime2024_llmjudge_gen_5e9f4f import aime2024_datasets # noqa: F401, F403

View File

@ -0,0 +1,90 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
aime2024_reader_cfg = dict(input_columns=['question'], output_column='answer')
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{question}\nRemember to put your final answer within \\boxed{}.',
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
)
)
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
)
]

View File

@ -0,0 +1,96 @@
# CoT: No CoT
# K-Shot: 0-Shot
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
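# Register 16 copies of the same dataset (aime2024-run0 ... aime2024-run15) so
# accuracy can be averaged across repeated runs when results are summarized.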
aime2024_datasets = [
dict(
abbr=f'aime2024-run{idx}',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
mode='singlescore',
)
for idx in range(16)
]

View File

@ -0,0 +1,96 @@
# CoT: No CoT
# K-Shot: 0-Shot
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
aime2024_datasets = [
dict(
abbr=f'aime2024-run{idx}',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
mode='singlescore',
)
for idx in range(8)
]

View File

@ -0,0 +1,90 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CustomDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
aime2025_reader_cfg = dict(input_columns=['question'], output_column='answer')
aime2025_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{question}\nRemember to put your final answer within \\boxed{}.',
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2025_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CustomDataset,
path='opencompass/aime2025',
reader_cfg=aime2025_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
aime2025_datasets = [
dict(
type=CustomDataset,
abbr='aime2025',
path='opencompass/aime2025',
reader_cfg=aime2025_reader_cfg,
infer_cfg=aime2025_infer_cfg,
eval_cfg=aime2025_eval_cfg,
)
]

View File

@ -0,0 +1,26 @@
# BBEH
```bash
python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
```
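A minimal sketch of wiring these datasets into your own evaluation config; the module path below is assumed from the config layout used elsewhere in this change and may need adjusting:

```python
# Sketch only: import the BBEH dataset configs and expose them as `datasets`,
# then launch with `python3 run.py <this_config>.py --debug`.
from mmengine.config import read_base

with read_base():
    # Assumed module path, mirroring the other opencompass.configs.datasets.* imports.
    from opencompass.configs.datasets.bbeh.bbeh_gen import \
        bbeh_datasets  # noqa: F401

datasets = [*bbeh_datasets]
```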
## Models
| model | score |
|:-----------------------------------------:|------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |
### Details
| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa |
|:-----------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|-----:|-----------------:|-------------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 |
| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
|:-----------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 |
| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
|:-----------------------------------------:|------------------:|-------:|-----------------:|----------------:|------------:|-------------:|--------------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 |

View File

@ -0,0 +1,93 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBEHDataset, BBEHEvaluator, bbeh_mcq_postprocess, BBEHEvaluator_mcq
bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbeh_multiple_choice_sets = [
'bbeh_boolean_expressions',
'bbeh_disambiguation_qa',
'bbeh_geometric_shapes',
'bbeh_hyperbaton',
'bbeh_movie_recommendation',
'bbeh_nycc',
'bbeh_shuffled_objects',
]
bbeh_free_form_sets = [
'bbeh_boardgame_qa',
'bbeh_buggy_tables',
'bbeh_causal_understanding',
'bbeh_dyck_languages',
'bbeh_linguini',
'bbeh_multistep_arithmetic',
'bbeh_object_counting',
'bbeh_object_properties',
'bbeh_sarc_triples',
'bbeh_spatial_reasoning',
'bbeh_sportqa',
'bbeh_temporal_sequence',
'bbeh_time_arithmetic',
'bbeh_web_of_lies',
'bbeh_word_sorting',
'bbeh_zebra_puzzles',
]
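# Build one dataset entry per subset: multiple-choice subsets are scored with
# BBEHEvaluator_mcq, free-form subsets with BBEHEvaluator.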
bbeh_datasets = []
for _name in bbeh_multiple_choice_sets:
bbeh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192))
bbeh_eval_cfg = dict(
evaluator=dict(type=BBEHEvaluator_mcq),
pred_role='BOT',
pred_postprocessor=dict(type=bbeh_mcq_postprocess),
dataset_postprocessor=dict(type=bbeh_mcq_postprocess))
bbeh_datasets.append(
dict(
type=BBEHDataset,
path='opencompass/bbeh',
name=_name,
abbr=_name,
reader_cfg=bbeh_reader_cfg,
infer_cfg=bbeh_infer_cfg.copy(),
eval_cfg=bbeh_eval_cfg.copy()))
for _name in bbeh_free_form_sets:
bbeh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192))
    bbeh_eval_cfg = dict(
        evaluator=dict(type=BBEHEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbeh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbeh_mcq_postprocess))
bbeh_datasets.append(
dict(
type=BBEHDataset,
path='opencompass/bbeh',
name=_name,
abbr=_name,
reader_cfg=bbeh_reader_cfg,
infer_cfg=bbeh_infer_cfg.copy(),
eval_cfg=bbeh_eval_cfg.copy()))

View File

@ -0,0 +1,126 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BBEHDataset,
generic_llmjudge_postprocess,
)
from opencompass.evaluator import GenericLLMEvaluator
bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbeh_multiple_choice_sets = [
'bbeh_boolean_expressions',
'bbeh_disambiguation_qa',
'bbeh_geometric_shapes',
'bbeh_hyperbaton',
'bbeh_movie_recommendation',
'bbeh_nycc',
'bbeh_shuffled_objects',
]
bbeh_free_form_sets = [
'bbeh_boardgame_qa',
'bbeh_buggy_tables',
'bbeh_causal_understanding',
'bbeh_dyck_languages',
'bbeh_linguini',
'bbeh_multistep_arithmetic',
'bbeh_object_counting',
'bbeh_object_properties',
'bbeh_sarc_triples',
'bbeh_spatial_reasoning',
'bbeh_sportqa',
'bbeh_temporal_sequence',
'bbeh_time_arithmetic',
'bbeh_web_of_lies',
'bbeh_word_sorting',
'bbeh_zebra_puzzles',
]
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
bbeh_datasets = []
for _name in bbeh_multiple_choice_sets + bbeh_free_form_sets:
bbeh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: ",
)
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
bbeh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=BBEHDataset,
path='opencompass/bbeh',
name=_name,
abbr=_name,
reader_cfg=bbeh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
bbeh_datasets.append(
dict(
type=BBEHDataset,
path='opencompass/bbeh',
name=_name,
abbr=_name,
reader_cfg=bbeh_reader_cfg,
infer_cfg=bbeh_infer_cfg,
eval_cfg=bbeh_eval_cfg,
)
)

View File

@ -0,0 +1,189 @@
# flake8: noqa
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import BBHDataset
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets
# For zero shot inference in bbh
bbh_datasets = []
for _name in bbh_sets:
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy())
)
# For original 3 shot inference in bbh
bbh_3_shot_datasets = []
for _name in bbh_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_3_shot_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bbh_gen_5b92b0 import bbh_datasets # noqa: F401, F403
from .bbh_gen_ee62e9 import bbh_datasets # noqa: F401, F403

View File

@ -0,0 +1,99 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
bbh_datasets = []
for _name in bbh_multiple_choice_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bbh_eval_cfg = dict(
evaluator=dict(type=BBHEvaluator_mcq),
pred_role='BOT',
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bbh_llmjudge_gen_b5bdf1 import bbh_datasets # noqa: F401, F403

View File

@ -0,0 +1,189 @@
# flake8: noqa
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import BBHDataset
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets
# For zero shot inference in bbh
bbh_datasets = []
for _name in bbh_sets:
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy())
)
# For original 3 shot inference in bbh
bbh_3_shot_datasets = []
for _name in bbh_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_3_shot_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))

View File

@ -1,53 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_full_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
bigcodebench_full_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,
max_out_len=1024))
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='full',
),
pred_role='BOT',
)
bigcodebench_full_complete_datasets = [
dict(
abbr='bigcodebench_full_complete',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2'
)
]
dict(abbr='bigcodebench_full_complete',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2')
]

View File

@ -1,53 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_full_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192)
)
bigcodebench_full_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,
max_out_len=8192))
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='full',
),
pred_role='BOT',
)
bigcodebench_full_instruct_datasets = [
dict(
abbr='bigcodebench_full_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2'
)
]
dict(abbr='bigcodebench_full_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2')
]

View File

@ -0,0 +1,7 @@
from mmengine.config import read_base
with read_base():
from .bigcodebench_hard_instruct_gen import bigcodebench_hard_instruct_datasets
from .bigcodebench_hard_complete_gen import bigcodebench_hard_complete_datasets
# Gather every *_datasets list imported above into one flat list.
bigcodebench_hard_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

View File

@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_hard_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
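# Scoring is delegated to a remote BigCodeBench evaluator service (the HF Space
# configured via remote_execute_api below).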
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='hard',
),
pred_role='BOT',
)
bigcodebench_hard_complete_datasets = [
dict(
abbr='bigcodebench_hard_complete',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_hard_reader_cfg,
infer_cfg=bigcodebench_hard_infer_cfg,
eval_cfg=bigcodebench_hard_eval_cfg,
release_version='v0.1.2',
dataset_version='hard',
)
]

View File

@ -1,40 +1,32 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_hard_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,
max_out_len=1024))
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='hard',
),
pred_role='BOT',
@ -51,4 +43,4 @@ bigcodebench_hard_complete_datasets = [
release_version='v0.1.2',
dataset_version='hard',
)
]
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bigcodebench_hard_instruct_gen_8815eb import bigcodebench_hard_instruct_datasets # noqa: F401, F403
from .bigcodebench_hard_instruct_gen_c3d5ad import bigcodebench_hard_instruct_datasets # noqa: F401, F403

View File

@ -1,40 +1,32 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_hard_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192)
)
bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer,
max_out_len=8192))
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='hard',
),
pred_role='BOT',
@ -51,4 +43,4 @@ bigcodebench_hard_instruct_datasets = [
release_version='v0.1.2',
dataset_version='hard',
)
]
]

View File

@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
bigcodebench_hard_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
# remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
remote_execute_api=
'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501
dataset_version='hard',
),
pred_role='BOT',
)
bigcodebench_hard_instruct_datasets = [
dict(
abbr='bigcodebench_hard_instruct',
type=BigCodeBenchDataset,
path='opencompass/bigcodebench',
reader_cfg=bigcodebench_hard_reader_cfg,
infer_cfg=bigcodebench_hard_infer_cfg,
eval_cfg=bigcodebench_hard_eval_cfg,
release_version='v0.1.2',
dataset_version='hard',
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, F403
from .cmmlu_0shot_cot_gen_305931 import cmmlu_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .cmmlu_llmjudge_gen_e1cd9a import cmmlu_datasets # noqa: F401, F403

View File

@ -0,0 +1,185 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教',
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
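# build one dataset entry per CMMLU subject; each is graded by an LLM judge via GenericLLMEvaluator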
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix + QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test',
),
),
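            # judge_cfg is left empty here; the judge model config is expected to be filled in at evaluation time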
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test',
),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
)
)
del _name, _ch_name

View File

@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2
cmo_fib_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
cmo_fib_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\n请一步一步地推理,并将最终答案写入\\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
cmo_fib_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)
cmo_fib_datasets = [
dict(
abbr='cmo_fib',
type=CMOFibDataset,
path='opencompass/cmo_fib',
reader_cfg=cmo_fib_reader_cfg,
infer_cfg=cmo_fib_infer_cfg,
eval_cfg=cmo_fib_eval_cfg
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .drop_openai_simple_evals_gen_3857b0 import drop_datasets
from .drop_openai_simple_evals_gen_3857b0 import drop_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .drop_llmjudge_gen_3857b0 import drop_datasets # noqa: F401, F403

View File

@ -0,0 +1,89 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DropOpenAIDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
with read_base():
from .drop_examples import drop_examples # noqa: F401, F403
drop_reader_cfg = dict(
input_columns=['prompt'],
output_column='answers',
train_split='validation',
test_split='validation',
)
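# few-shot passages and Q&A pairs from drop_examples are embedded directly into the prompt string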
template = f'You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.\n\n{drop_examples}\n\n# Your Task\n\n---\n{{prompt}}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {prompt}\n \n<Original Question End>\n\n
<Gold Target Begin>: \n{answers}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
drop_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt=template)]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
drop_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=DropOpenAIDataset,
path='data/drop_simple_eval/dev.jsonl',
reader_cfg=drop_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
drop_datasets = [
dict(
abbr='drop',
type=DropOpenAIDataset,
path='data/drop_simple_eval/dev.jsonl',
reader_cfg=drop_reader_cfg,
infer_cfg=drop_infer_cfg,
eval_cfg=drop_eval_cfg,
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .gpqa_0shot_nocot_genericllmeval_gen_772ea0 import gpqa_datasets # noqa: F401, F403

View File

@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_dataset_postprocess
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
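# the prompt requests a \boxed{} answer, so scoring reuses the MATH-style evaluator and post-processor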
gsm8k_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
)
gsm8k_datasets = [
dict(
abbr='gsm8k',
type=GSM8KDataset,
path='opencompass/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg,
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .hellaswag_gen_6faab5 import hellaswag_datasets # noqa: F401, F403
from .hellaswag_10shot_gen_e42710 import hellaswag_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .hellaswag_llmjudge_gen_809ef1 import hellaswag_datasets # noqa: F401, F403

View File

@ -0,0 +1,97 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HellaswagDatasetwithICE
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
hellaswag_reader_cfg = dict(
input_columns=['ctx', 'A', 'B', 'C', 'D'],
output_column='label',
train_split='train',
test_split='val',
)
align_prompt = """Continue the following text without adding any additional information or formatting:
{ctx}
A) {A}
B) {B}
C) {C}
D) {D}
What is the right option?'"""
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {ctx}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
hellaswag_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=align_prompt),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
hellaswag_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=HellaswagDatasetwithICE,
path='opencompass/hellaswag_ice',
reader_cfg=hellaswag_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
hellaswag_datasets = [
dict(
abbr='hellaswag',
type=HellaswagDatasetwithICE,
path='opencompass/hellaswag_ice',
reader_cfg=hellaswag_reader_cfg,
infer_cfg=hellaswag_infer_cfg,
eval_cfg=hellaswag_eval_cfg,
)
]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .humaneval_gen_8e312c import humaneval_datasets # noqa: F401, F403
from .humaneval_openai_sample_evals_gen_dcae0e import humaneval_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .korbench_single_0_shot_gen import korbench_0shot_single_datasets # noqa: F401, F403

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .korbench_single_0shot_genericllmeval_gen_56cf43 import korbench_0shot_single_datasets # noqa: F401, F403

Some files were not shown because too many files have changed in this diff.