Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Merge branch 'open-compass:main' into main

Commit 975e4bcadf
.github/scripts/eval_regression_api.py (4 changed lines)

@@ -25,8 +25,8 @@ models = [
         type=OpenAISDK,
         key='EMPTY',
         openai_api_base='http://localhost:23333/v1',
-        path='internlm2',
-        tokenizer_path='internlm/internlm2_5-7b-chat',
+        path='internlm3',
+        tokenizer_path='internlm/internlm3-8b-instruct',
         rpm_verbose=True,
         meta_template=api_meta_template,
         query_per_second=128,
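The hunk above retargets the API regression config from internlm2 to internlm3. For context, here is a minimal sketch of what the full OpenAISDK entry plausibly looks like after this change; everything other than the fields visible in the diff (the abbr, max_out_len, batch_size and the exact meta_template) is an illustrative assumption, not code copied from the repository.

```python
# Hedged sketch of the updated API model entry (assumed fields are marked).
from opencompass.models import OpenAISDK

api_meta_template = dict(round=[          # assumed two-role template
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

models = [
    dict(
        abbr='lmdeploy-api-test',         # assumed; matches the baseline key below
        type=OpenAISDK,
        key='EMPTY',
        openai_api_base='http://localhost:23333/v1',
        path='internlm3',                 # was 'internlm2' before this commit
        tokenizer_path='internlm/internlm3-8b-instruct',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=128,
        max_out_len=1024,                 # assumed
        batch_size=32,                    # assumed
    ),
]
```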
.github/scripts/eval_regression_base_models.py (18 changed lines)

@@ -11,18 +11,10 @@ with read_base():
     from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
         winogrande_datasets  # noqa: F401, E501
     # read hf models - chat models
-    from opencompass.configs.models.chatglm.hf_glm4_9b import \
-        models as hf_glm4_9b_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
         models as lmdeploy_glm4_9b_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
         models as hf_deepseek_7b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
-        models as hf_deepseek_67b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
-        models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
-        models as hf_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
         models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \

@@ -49,12 +41,6 @@ with read_base():
         models as hf_internlm2_5_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
         models as hf_internlm2_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
-        models as hf_internlm2_20b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
-        models as hf_internlm2_base_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
-        models as hf_internlm2_base_20b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
         models as lmdeploy_internlm2_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \

@@ -65,14 +51,14 @@ with read_base():
         models as lmdeploy_internlm2_20b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
         models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
+        models as lmdeploy_internlm2_base_20b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama2_7b import \
         models as hf_llama2_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
         models as hf_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_8b import \
         models as hf_llama3_8b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.hf_llama3_70b import \
-        models as hf_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
         models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
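The whole change to this script is which configs get imported inside `read_base()`: dropping an import removes that model from the base-model regression, while the added lmdeploy_internlm2_base_20b import enrolls a new one. Below is a small, self-contained sketch of the aggregation idiom these scripts rely on; the variable contents are placeholders, but the `locals()` filtering mirrors the `datasets = sum(...)` line visible in the chat-model script further down this diff.

```python
# Stand-in model/dataset lists; in the real script these come from the
# `from opencompass.configs... import models as <name>_model` imports above.
hf_deepseek_7b_base_model = [dict(abbr='deepseek-7b-base-hf')]
lmdeploy_glm4_9b_model = [dict(abbr='glm-4-9b-turbomind')]
winogrande_datasets = [dict(abbr='winogrande')]

# Every module-level symbol ending in `_model` feeds `models`, and every one
# ending in `_datasets` feeds `datasets` (assumed suffix convention).
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

print([m['abbr'] for m in models])    # ['deepseek-7b-base-hf', 'glm-4-9b-turbomind']
print([d['abbr'] for d in datasets])  # ['winogrande']
```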
.github/scripts/eval_regression_chat_models.py (41 changed lines)

@@ -15,14 +15,24 @@ with read_base():
         models as vllm_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
         models as hf_deepseek_7b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
-        models as hf_deepseek_67b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
-        models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
-        models as hf_deepseek_v2_lite_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
+        models as lmdeploy_deepseek_67b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_llama_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_llama_70b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_qwen_1_5b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
+        models as \
+        lmdeploy_deepseek_r1_distill_qwen_32b_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
         models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
+        models as lmdeploy_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
         models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b_it import \

@@ -45,6 +55,8 @@ with read_base():
         models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
         models as hf_internlm2_5_20b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
+        models as hf_internlm3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
         models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \

@@ -57,6 +69,8 @@ with read_base():
         models as lmdeploy_internlm2_chat_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
         models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
+        models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
         models as vllm_internlm2_chat_7b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \

@@ -83,10 +97,6 @@ with read_base():
         models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
         models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
-    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
-        models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
-    from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
-        models as hf_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
         models as \
         lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501

@@ -95,14 +105,19 @@ with read_base():
     from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
         models as \
         lmdeploy_mistral_small_instruct_2409_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
+        models as \
+        lmdeploy_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
         models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
         models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
+        models as vllm_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
         models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
-    from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
-        models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.phi.hf_phi_4 import \
+        models as hf_phi_4_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
         models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \

@@ -142,6 +157,8 @@ with read_base():

     from ...volc import infer as volc_infer  # noqa: F401, E501

+hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
+
 race_datasets = [race_datasets[1]]
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
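After the imports, the script patches one imported config in place and flattens the dataset lists. Because each imported `models` symbol is a list of config dicts, the first element can simply be re-pointed at a different HuggingFace path, which is what the added `hf_glm4_9b_chat_model[0]['path']` line does. A minimal sketch of that idiom follows; the dict contents and the RACE split order are placeholders/assumptions, only the override pattern is taken from the diff.

```python
# Placeholder stand-ins for the imported config lists; only the in-place
# override idiom is the point here.
hf_glm4_9b_chat_model = [
    dict(abbr='glm-4-9b-chat-hf', path='THUDM/glm-4-9b-chat'),  # assumed original path
]
race_datasets = [dict(abbr='race-middle'), dict(abbr='race-high')]  # order assumed

# Re-point the checkpoint, exactly as in the diff above.
hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'

# Keep a single RACE split, mirroring `race_datasets = [race_datasets[1]]`.
race_datasets = [race_datasets[1]]

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
print(hf_glm4_9b_chat_model[0]['path'], [d['abbr'] for d in datasets])
```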
.github/scripts/oc_score_assert.py (40 changed lines)

@@ -175,10 +175,11 @@ class TestApibench:
 class TestVolcFullbench:
     """Test cases for chat model."""

-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
-         for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
+        'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
+        'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
+    ] for p2 in dataset_list(p1, 'objective')])
     @pytest.mark.chat_objective
     def test_chat_objective(self, baseline_scores_fullbench, result_scores,
                             model, dataset):

@@ -245,10 +246,7 @@ class TestCmdCase:
     @pytest.mark.parametrize('model, dataset',
                              [('internlm2_5-7b-hf', 'race-middle_accuracy'),
                               ('internlm2_5-7b-hf', 'race-high_accuracy'),
-                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
-                              ('internlm2-1.8b-hf', 'race-middle_accuracy'),
-                              ('internlm2-1.8b-hf', 'race-high_accuracy'),
-                              ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
+                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)

@@ -260,9 +258,9 @@ class TestCmdCase:
         [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
          ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
          ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
-         ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
-         ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
-         ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
+         ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
+         ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)

@@ -280,13 +278,25 @@ class TestCmdCase:

     @pytest.mark.case4
     @pytest.mark.parametrize(
-        'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
-                           ('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
-                           ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(model, result_score, base_score, dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.case5
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
+    def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)


 def assert_score(model_type, score, baseline, dataset: str = ''):
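The new test cases follow the same shape: parametrize over (model, metric) pairs, look the pair up in the baseline and result fixtures, and hand both to `assert_score`, now tagging the model with `'_batch'` so the batched variant is treated accordingly. Below is a self-contained sketch of that pattern; the in-memory dicts stand in for the `baseline_scores`/`result_scores` fixtures, and the relative tolerances are assumptions rather than the repository's real thresholds.

```python
# Hedged sketch of the parametrize-and-compare pattern in oc_score_assert.py.
import pytest

BASELINE_SCORES = {   # stand-in for the baseline_scores fixture
    'internlm3-8b-instruct_hf-vllm': {
        'race-middle_accuracy': 92.20,
        'race-high_accuracy': 89.88,
        'demo_gsm8k_accuracy': 81.25,
    },
}
RESULT_SCORES = {     # stand-in for the result_scores fixture
    'internlm3-8b-instruct_hf-vllm': {
        'race-middle_accuracy': 92.20,
        'race-high_accuracy': 90.02,
        'demo_gsm8k_accuracy': 81.25,
    },
}


def assert_score(model_type, score, baseline, dataset=''):
    # Assumed rule: batched runs ('_batch' suffix) get a looser relative
    # tolerance than deterministic greedy runs.
    rel_tol = 0.08 if model_type.endswith('_batch') else 0.02
    assert abs(score - baseline) <= rel_tol * max(baseline, 1e-6), (
        f'{model_type}/{dataset}: got {score}, baseline {baseline}')


@pytest.mark.parametrize(
    'model, dataset',
    [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
     ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
     ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
def test_cmd_case5(model, dataset):
    base_score = BASELINE_SCORES.get(model).get(dataset)
    result_score = RESULT_SCORES.get(model).get(dataset)
    assert_score(model + '_batch', result_score, base_score, dataset)
```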
.github/scripts/oc_score_baseline.yaml (29 changed lines)

@@ -8,20 +8,25 @@ internlm2_5-7b_hf:
   race-middle_accuracy: 91.78
   race-high_accuracy: 90.02

-internlm2-1.8b-hf:
-  demo_gsm8k_accuracy: 15.62
-  race-middle_accuracy: 71.66
-  race-high_accuracy: 66.38
-
 internlm2_5-7b-chat-lmdeploy:
-  demo_gsm8k_accuracy: 89.06
+  demo_gsm8k_accuracy: 87.50
   race-middle_accuracy: 92.76
   race-high_accuracy: 90.54

-internlm2-chat-1.8b-lmdeploy:
-  demo_gsm8k_accuracy: 31
-  race-middle_accuracy: 81.34
-  race-high_accuracy: 73.96
+internlm3-8b-instruct-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34

+internlm3-8b-instruct_hf-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+internlm3-8b-instruct_hf-vllm:
+  demo_gsm8k_accuracy: 81.25
+  race-middle_accuracy: 92.20
+  race-high_accuracy: 89.88
+
 internlm2_5-7b-chat_hf:
   demo_gsm8k_accuracy: 87.50

@@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf:
   race-high_accuracy: 90.48

 lmdeploy-api-test:
-  gsm8k_accuracy: 68.75
-  race-middle_accuracy: 87.50
+  gsm8k_accuracy: 56.25
+  race-middle_accuracy: 93.75
   race-high_accuracy: 93.75
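The baseline file is a two-level map: model abbreviation to metric name to expected score. Here is a short sketch of how a fixture could load it with PyYAML and serve lookups; the embedded values are copied from the hunk above, while where and how the real fixture reads the file is an assumption.

```python
# Sketch of loading oc_score_baseline.yaml-style data; values from the diff above.
import yaml

BASELINE_SNIPPET = """
internlm3-8b-instruct-lmdeploy:
  demo_gsm8k_accuracy: 73.44
  race-middle_accuracy: 93.38
  race-high_accuracy: 90.34
internlm3-8b-instruct_hf-vllm:
  demo_gsm8k_accuracy: 81.25
  race-middle_accuracy: 92.20
  race-high_accuracy: 89.88
"""

baseline_scores = yaml.safe_load(BASELINE_SNIPPET)
assert baseline_scores['internlm3-8b-instruct-lmdeploy']['race-high_accuracy'] == 90.34
```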
.github/scripts/oc_score_baseline_fullbench.yaml (683 changed lines)

@@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench:
     drop_accuracy: 81.25
     GPQA_diamond_accuracy: 25
     hellaswag_accuracy: 87.5
-    TheoremQA_score: 18.75
+    TheoremQA_score: 12.50
     musr_average_naive_average: 39.58
     korbench_single_naive_average: 40
     gsm8k_accuracy: 62.50

@@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench:
     lcb_test_output_pass@1: 18.75
     bbh-logical_deduction_seven_objects_score: 50
     bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 72.6
-    cmmlu-china-specific_naive_average: 76.25
+    mmlu-other_accuracy: 72.6
+    cmmlu-china-specific_accuracy: 76.25
     mmlu_pro_math_accuracy: 25
     ds1000_Pandas_accuracy: 12.5
     ds1000_Numpy_accuracy: 0

@@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
     college_knowledge_naive_average: 87.5
   subjective:
     alignment_bench_v1_1_总分: 0.66
-    alpaca_eval_total: 20
+    alpaca_eval_total: 0
     arenahard_score: 50
     Followbench_naive_average: 1
-    CompassArena_naive_average: 44.00
+    CompassArena_naive_average: 43
     mtbench101_avg: 7.8
-    wildbench_average: -12.78
+    wildbench_average: -15.56
     simpleqa_accuracy_given_attempted: 0
     chinese_simpleqa_given_attempted_accuracy: 1
-    alignment_bench_v1_1_专业能力: 7.90
+    alignment_bench_v1_1_专业能力: 8.00
     alignment_bench_v1_1_数学计算: 0
     alignment_bench_v1_1_基本任务: 0
     alignment_bench_v1_1_逻辑推理: 0

@@ -55,10 +55,10 @@ internlm2_5-7b-chat-hf_fullbench:
     alignment_bench_v1_1_文本写作: 0
     alignment_bench_v1_1_角色扮演: 0
     alignment_bench_v1_1_综合问答: 0
-    alpaca_eval_helpful_base: 20
+    alpaca_eval_helpful_base: 0
     compassarena_language_naive_average: 35
     compassarena_knowledge_naive_average: 55
-    compassarena_reason_v2_naive_average: 45.00
+    compassarena_reason_v2_naive_average: 40
     compassarena_math_v2_naive_average: 55
     compassarena_creationv2_zh_naive_average: 30
     followbench_llmeval_en_HSR_AVG: 1
@@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench:
 internlm2_5-7b-chat-turbomind_fullbench:
   objective:
     race-high_accuracy: 93.75
-    ARC-c_accuracy: 93.75
+    ARC-c_accuracy: 87.50
     BoolQ_accuracy: 68.75
     triviaqa_wiki_1shot_score: 50
     nq_open_1shot_score: 25
     IFEval_Prompt-level-strict-accuracy: 56.25
-    drop_accuracy: 81.25
+    drop_accuracy: 75
     GPQA_diamond_accuracy: 31.25
-    hellaswag_accuracy: 81.25
-    TheoremQA_score: 6.25
+    hellaswag_accuracy: 87.5
+    TheoremQA_score: 12.5
     musr_average_naive_average: 39.58
-    korbench_single_naive_average: 37.50
-    gsm8k_accuracy: 68.75
-    math_accuracy: 68.75
+    korbench_single_naive_average: 40
+    gsm8k_accuracy: 62.5
+    math_accuracy: 75
     cmo_fib_accuracy: 6.25
     aime2024_accuracy: 6.25
-    wikibench-wiki-single_choice_cncircular_perf_4: 50.00
+    wikibench-wiki-single_choice_cncircular_perf_4: 25
     sanitized_mbpp_score: 68.75
-    ds1000_naive_average: 16.96
+    ds1000_naive_average: 17.86
     lcb_code_generation_pass@1: 12.5
     lcb_code_execution_pass@1: 43.75
-    lcb_test_output_pass@1: 25.00
-    bbh-logical_deduction_seven_objects_score: 50.00
-    bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 69.71
-    cmmlu-china-specific_naive_average: 75.83
+    lcb_test_output_pass@1: 18.75
+    bbh-logical_deduction_seven_objects_score: 56.25
+    bbh-multistep_arithmetic_two_score: 75
+    mmlu-other_accuracy: 72.6
+    cmmlu-china-specific_accuracy: 78.33
     mmlu_pro_math_accuracy: 31.25
-    ds1000_Pandas_accuracy: 0
+    ds1000_Pandas_accuracy: 12.5
     ds1000_Numpy_accuracy: 0
     ds1000_Tensorflow_accuracy: 12.5
-    ds1000_Scipy_accuracy: 18.75
+    ds1000_Scipy_accuracy: 25
     ds1000_Sklearn_accuracy: 18.75
-    ds1000_Pytorch_accuracy: 18.75
+    ds1000_Pytorch_accuracy: 6.25
     ds1000_Matplotlib_accuracy: 50.00
     openai_mmmlu_lite_AR-XY_accuracy: 37.5
     college_naive_average: 12.50
     college_knowledge_naive_average: 87.5
   subjective:
-    alignment_bench_v1_1_总分: 0.70
+    alignment_bench_v1_1_总分: 0.66
     alpaca_eval_total: 0
     arenahard_score: 50
     Followbench_naive_average: 1
-    CompassArena_naive_average: 38
-    mtbench101_avg: 7.80
-    wildbench_average: -4.86
+    CompassArena_naive_average: 40
+    mtbench101_avg: 8
+    wildbench_average: -6.81
     simpleqa_accuracy_given_attempted: 0
     chinese_simpleqa_given_attempted_accuracy: 1
-    alignment_bench_v1_1_专业能力: 8.4
+    alignment_bench_v1_1_专业能力: 7.9
     alignment_bench_v1_1_数学计算: 0
     alignment_bench_v1_1_基本任务: 0
     alignment_bench_v1_1_逻辑推理: 0

@@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench:
     alignment_bench_v1_1_综合问答: 0
     alpaca_eval_helpful_base: 0
     compassarena_language_naive_average: 35
-    compassarena_knowledge_naive_average: 50
-    compassarena_reason_v2_naive_average: 30
-    compassarena_math_v2_naive_average: 50
-    compassarena_creationv2_zh_naive_average: 25
+    compassarena_knowledge_naive_average: 45
+    compassarena_reason_v2_naive_average: 25
+    compassarena_math_v2_naive_average: 60
+    compassarena_creationv2_zh_naive_average: 35
     followbench_llmeval_en_HSR_AVG: 1
     followbench_llmeval_en_SSR_AVG: 1
     followbench_llmeval_en_HSR_L1: 1

@@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
     drop_accuracy: 62.5
     GPQA_diamond_accuracy: 62.5
     hellaswag_accuracy: 93.75
-    TheoremQA_score: 25
+    TheoremQA_score: 12.50
     winogrande_accuracy: 75
     gsm8k_accuracy: 37.5
     GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
@@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench:
     drop_accuracy: 62.5
     GPQA_diamond_accuracy: 62.5
     hellaswag_accuracy: 93.75
-    TheoremQA_score: 25.00
+    TheoremQA_score: 12.50
     winogrande_accuracy: 87.5
-    gsm8k_accuracy: 62.50
-    GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
+    gsm8k_accuracy: 56.25
+    GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
     GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
     math_accuracy: 18.75
     wikibench-wiki-single_choice_cncircular_perf_4: 25
     sanitized_mbpp_score: 62.50
-    dingo_en_192_score: 31.25
+    dingo_en_192_score: 50.00
     dingo_zh_170_score: 93.75
     mmlu-other_accuracy: 76.92
     cmmlu-china-specific_accuracy: 84.17
     mmlu_pro_math_accuracy: 18.75
-    bbh-logical_deduction_seven_objects_score: 50
+    bbh-logical_deduction_seven_objects_score: 43.75
     bbh-multistep_arithmetic_two_score: 56.25
     college_naive_average: 12.5
     college_knowledge_naive_average: 87.5

@@ -234,15 +234,15 @@ internlm2_5-7b-turbomind:
     sanitized_mbpp_score: 55.25
     dingo_en_192_score: 60.94
     dingo_zh_170_score: 67.65
-    mmlu-stem_naive_average: 63.72
-    mmlu-social-science_naive_average: 80.15
-    mmlu-humanities_naive_average: 74.27
-    mmlu-other_naive_average: 71.85
-    cmmlu-stem_naive_average: 67.07
-    cmmlu-social-science_naive_average: 81.49
-    cmmlu-humanities_naive_average: 85.84
-    cmmlu-other_naive_average: 82.69
-    cmmlu-china-specific_naive_average: 79.88
+    mmlu-stem_accuracy: 63.72
+    mmlu-social-science_accuracy: 80.15
+    mmlu-humanities_accuracy: 74.27
+    mmlu-other_accuracy: 71.85
+    cmmlu-stem_accuracy: 67.07
+    cmmlu-social-science_accuracy: 81.49
+    cmmlu-humanities_accuracy: 85.84
+    cmmlu-other_accuracy: 82.69
+    cmmlu-china-specific_accuracy: 79.88
     mmlu_pro_biology_accuracy: 58.58
     mmlu_pro_business_accuracy: 28.01
     mmlu_pro_chemistry_accuracy: 22.79

@@ -281,12 +281,12 @@ internlm2_5-7b-turbomind:
     longbench_naive_average: 46.19
     longbench_zh_naive_average: 49.3
     longbench_en_naive_average: 43.97
-    longbench_single-document-qa_naive_average: 42.84
-    longbench_multi-document-qa_naive_average: 37.29
-    longbench_summarization_naive_average: 23.21
-    longbench_few-shot-learning_naive_average: 61.67
-    longbench_synthetic-tasks_naive_average: 60.05
-    longbench_code-completion_naive_average: 52.09
+    longbench_single-document-qa_score: 42.84
+    longbench_multi-document-qa_score: 41.25
+    longbench_summarization_score: 23.21
+    longbench_few-shot-learning_score: 61.67
+    longbench_synthetic-tasks_score: 60.05
+    longbench_code-completion_score: 52.09

 internlm2_5-7b-chat-turbomind:
   objective:

@@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind:
     teval_naive_average: 80
     SciCode_sub_accuracy: 5.56
     qa_dingo_cn_score: 99.01
-    mmlu-stem_naive_average: 68.2
-    mmlu-social-science_naive_average: 75.8
-    mmlu-humanities_naive_average: 69.3
-    mmlu-other_naive_average: 71.3
-    cmmlu-stem_naive_average: 66.64
-    cmmlu-social-science_naive_average: 76
-    cmmlu-humanities_naive_average: 77.9
-    cmmlu-other_naive_average: 77.25
-    cmmlu-china-specific_naive_average: 73.6
+    mmlu-stem_accuracy: 68.2
+    mmlu-social-science_accuracy: 75.8
+    mmlu-humanities_accuracy: 69.3
+    mmlu-other_accuracy: 71.3
+    cmmlu-stem_accuracy: 66.64
+    cmmlu-social-science_accuracy: 76
+    cmmlu-humanities_accuracy: 77.9
+    cmmlu-other_accuracy: 77.25
+    cmmlu-china-specific_accuracy: 73.6
     mmlu_pro_biology_accuracy: 66.67
     mmlu_pro_business_accuracy: 47.91
     mmlu_pro_chemistry_accuracy: 35

@@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind:
     openai_mmmlu_lite_DE-DE_accuracy: 51.27
     openai_mmmlu_lite_ES-LA_accuracy: 56.94
     openai_mmmlu_lite_FR-FR_accuracy: 58.22
-    openai_mmmlu_lite_HI-IN_accuracy: 33.75
+    openai_mmmlu_lite_HI-IN_accuracy: 30.75
     openai_mmmlu_lite_ID-ID_accuracy: 50.6
     openai_mmmlu_lite_IT-IT_accuracy: 50.6
     openai_mmmlu_lite_JA-JP_accuracy: 51.13

@@ -391,10 +391,10 @@ internlm2_5-7b-chat-turbomind:
     alpaca_eval_total: 25.96
     arenahard_score: 17.15
     Followbench_naive_average: 0.81
-    CompassArena_naive_average: 34.61
+    CompassArena_naive_average: 39.49
     FoFo_naive_average: 0.38
     mtbench101_avg: 8.01
-    wildbench_average: -15.69
+    wildbench_average: -10.49
     simpleqa_accuracy_given_attempted: 0.04
     chinese_simpleqa_given_attempted_accuracy: 0.34
     alignment_bench_v1_1_专业能力: 6.05

@@ -409,12 +409,12 @@ internlm2_5-7b-chat-turbomind:
     alpaca_eval_koala: 28.21
     alpaca_eval_oasst: 23.4
     alpaca_eval_selfinstruct: 30.95
-    alpaca_eval_vicuna: 25
-    compassarena_language_naive_average: 52.5
+    alpaca_eval_vicuna: 33.75
+    compassarena_language_naive_average: 58.50
     compassarena_knowledge_naive_average: 36
     compassarena_reason_v2_naive_average: 35
-    compassarena_math_v2_naive_average: 19.91
-    compassarena_creationv2_zh_naive_average: 35.81
+    compassarena_math_v2_naive_average: 25.95
+    compassarena_creationv2_zh_naive_average: 43.64
     fofo_test_prompts_overall: 0.35
     fofo_test_prompts_cn_overall: 0.41
     followbench_llmeval_en_HSR_AVG: 0.73
@@ -448,9 +448,536 @@ internlm2_5-7b-chat-1m-turbomind:
     babilong_32k_naive_average: 48.9
     babilong_128k_naive_average: 40.8
     babilong_256k_naive_average: 23.5
-    longbench_single-document-qa_naive_average: 43.56
-    longbench_multi-document-qa_naive_average: 46.24
-    longbench_summarization_naive_average: 24.32
-    longbench_few-shot-learning_naive_average: 51.67
-    longbench_synthetic-tasks_naive_average: 66.83
-    longbench_code-completion_naive_average: 45.99
+    longbench_single-document-qa_score: 43.56
+    longbench_multi-document-qa_score: 46.24
+    longbench_summarization_score: 24.32
+    longbench_few-shot-learning_score: 51.67
+    longbench_synthetic-tasks_score: 66.83
+    longbench_code-completion_score: 45.99
+
+qwen2.5-7b-instruct-turbomind:
+  objective:
+    race-high_accuracy: 84.99
+    ARC-c_accuracy: 92.2
+    BoolQ_accuracy: 86.7
+    triviaqa_wiki_1shot_score: 53.06
+    nq_open_1shot_score: 17.51
+    mmmlu_lite_naive_average: 54.96
+    IFEval_Prompt-level-strict-accuracy: 71.53
+    drop_accuracy: 80.07
+    bbh_naive_average: 68.81
+    GPQA_diamond_accuracy: 34.34
+    hellaswag_accuracy: 85.42
+    TheoremQA_score: 18.38
+    musr_average_naive_average: 43.44
+    korbench_single_naive_average: 39.44
+    ARC_Prize_Public_Evaluation_accuracy: 0
+    gsm8k_accuracy: 92.57
+    GaokaoBench_weighted_average: 80.14
+    math_accuracy: 73.58
+    cmo_fib_accuracy: 25
+    aime2024_accuracy: 16.67
+    Mathbench_naive_average: 77.33
+    wikibench-wiki-single_choice_cncircular_perf_4: 34.9
+    cmmlu_naive_average: 75.97
+    mmlu_naive_average: 76.01
+    mmlu_pro_naive_average: 56.12
+    openai_humaneval_humaneval_pass@1: 83.54
+    sanitized_mbpp_score: 74.71
+    humanevalx_naive_average: 48.29
+    ds1000_naive_average: 18.66
+    lcb_code_generation_pass@1: 39.5
+    lcb_code_execution_pass@1: 42.38
+    lcb_test_output_pass@1: 50.68
+    bigcodebench_hard_instruct_pass@1: 16.22
+    bigcodebench_hard_complete_pass@1: 11.49
+    teval_naive_average: 79.72
+    SciCode_sub_accuracy: 10.76
+    qa_dingo_cn_score: 99.01
+    mmlu_accuracy: 76.01
+    mmlu-stem_accuracy: 77.59
+    mmlu-social-science_accuracy: 79.02
+    mmlu-humanities_accuracy: 72.07
+    mmlu-other_accuracy: 74.86
+    cmmlu_accuracy: 75.97
+    cmmlu-stem_accuracy: 73.09
+    cmmlu-social-science_accuracy: 75.95
+    cmmlu-humanities_accuracy: 76.53
+    cmmlu-other_accuracy: 78.79
+    cmmlu-china-specific_accuracy: 73.17
+    mmlu_pro_accuracy: 56.12
+    mmlu_pro_biology_accuracy: 71.41
+    mmlu_pro_business_accuracy: 67.68
+    mmlu_pro_chemistry_accuracy: 54.59
+    mmlu_pro_computer_science_accuracy: 58.29
+    mmlu_pro_economics_accuracy: 66.82
+    mmlu_pro_engineering_accuracy: 42.41
+    mmlu_pro_health_accuracy: 55.87
+    mmlu_pro_history_accuracy: 46.46
+    mmlu_pro_law_accuracy: 28.97
+    mmlu_pro_math_accuracy: 73.13
+    mmlu_pro_philosophy_accuracy: 44.89
+    mmlu_pro_physics_accuracy: 58.43
+    mmlu_pro_psychology_accuracy: 63.16
+    mmlu_pro_other_accuracy: 53.57
+    humanevalx-python_pass@1: 50
+    humanevalx-cpp_pass@1: 42.07
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 74.39
+    humanevalx-js_pass@1: 75
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 8.18
+    ds1000_Tensorflow_accuracy: 17.78
+    ds1000_Scipy_accuracy: 15.09
+    ds1000_Sklearn_accuracy: 10.43
+    ds1000_Pytorch_accuracy: 4.41
+    ds1000_Matplotlib_accuracy: 60.65
+    mmmlu_lite_accuracy: 54.96
+    openai_mmmlu_lite_AR-XY_accuracy: 42.32
+    openai_mmmlu_lite_BN-BD_accuracy: 42.25
+    openai_mmmlu_lite_DE-DE_accuracy: 59.93
+    openai_mmmlu_lite_ES-LA_accuracy: 66.53
+    openai_mmmlu_lite_FR-FR_accuracy: 66.88
+    openai_mmmlu_lite_HI-IN_accuracy: 49.26
+    openai_mmmlu_lite_ID-ID_accuracy: 61.26
+    openai_mmmlu_lite_IT-IT_accuracy: 65.47
+    openai_mmmlu_lite_JA-JP_accuracy: 61.54
+    openai_mmmlu_lite_KO-KR_accuracy: 60.28
+    openai_mmmlu_lite_PT-BR_accuracy: 55.51
+    openai_mmmlu_lite_SW-KE_accuracy: 36.42
+    openai_mmmlu_lite_YO-NG_accuracy: 32.14
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.61
+    college_naive_average: 48
+    high_naive_average: 59
+    middle_naive_average: 78
+    primary_naive_average: 85.67
+    arithmetic_naive_average: 75.67
+    mathbench-a (average)_naive_average: 69.27
+    college_knowledge_naive_average: 83.86
+    high_knowledge_naive_average: 80.29
+    middle_knowledge_naive_average: 84.26
+    primary_knowledge_naive_average: 93.16
+    mathbench-t (average)_naive_average: 85.39
+
+internlm2_5-7b-chat-pytorch:
+  objective:
+    race-high_accuracy: 86.39
+    ARC-c_accuracy: 90.51
+    BoolQ_accuracy: 88.01
+    triviaqa_wiki_1shot_score: 64.77
+    nq_open_1shot_score: 22.71
+    mmmlu_lite_naive_average: 45.02
+    IFEval_Prompt-level-strict-accuracy: 56.56
+    drop_accuracy: 75.46
+    bbh_naive_average: 73.34
+    GPQA_diamond_accuracy: 32.83
+    hellaswag_accuracy: 94.81
+    TheoremQA_score: 23.88
+    musr_average_naive_average: 51.31
+    korbench_single_naive_average: 32
+    ARC_Prize_Public_Evaluation_accuracy: 0.01
+    gsm8k_accuracy: 86.96
+    GaokaoBench_weighted_average: 78.05
+    math_accuracy: 60.34
+    cmo_fib_accuracy: 12.98
+    aime2024_accuracy: 3.33
+    Mathbench_naive_average: 64.82
+    wikibench-wiki-single_choice_cncircular_perf_4: 31.7
+    cmmlu_naive_average: 74.24
+    mmlu_naive_average: 70.2
+    mmlu_pro_naive_average: 45.39
+    openai_humaneval_humaneval_pass@1: 70.12
+    sanitized_mbpp_score: 64.59
+    humanevalx_naive_average: 38.78
+    ds1000_naive_average: 14.19
+    lcb_code_generation_pass@1: 16.5
+    lcb_code_execution_pass@1: 33.82
+    lcb_test_output_pass@1: 22.62
+    bigcodebench_hard_instruct_pass@1: 6.08
+    bigcodebench_hard_complete_pass@1: 6.76
+    teval_naive_average: 79.73
+    SciCode_sub_accuracy: 3.47
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 70.2
+    mmlu-stem_accuracy: 67.73
+    mmlu-social-science_accuracy: 75.49
+    mmlu-humanities_accuracy: 68.56
+    mmlu-other_accuracy: 70.58
+    cmmlu_accuracy: 74.24
+    cmmlu-stem_accuracy: 66.7
+    cmmlu-social-science_accuracy: 75.88
+    cmmlu-humanities_accuracy: 77.56
+    cmmlu-other_accuracy: 77.52
+    cmmlu-china-specific_accuracy: 73.46
+    mmlu_pro_accuracy: 45.39
+    mmlu_pro_biology_accuracy: 65.83
+    mmlu_pro_business_accuracy: 51.96
+    mmlu_pro_chemistry_accuracy: 36.84
+    mmlu_pro_computer_science_accuracy: 48.29
+    mmlu_pro_economics_accuracy: 56.16
+    mmlu_pro_engineering_accuracy: 29.1
+    mmlu_pro_health_accuracy: 44.5
+    mmlu_pro_history_accuracy: 42.26
+    mmlu_pro_law_accuracy: 24.98
+    mmlu_pro_math_accuracy: 54.85
+    mmlu_pro_philosophy_accuracy: 39.28
+    mmlu_pro_physics_accuracy: 37.41
+    mmlu_pro_psychology_accuracy: 58.27
+    mmlu_pro_other_accuracy: 45.78
+    humanevalx-python_pass@1: 56.1
+    humanevalx-cpp_pass@1: 20.73
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 59.15
+    humanevalx-js_pass@1: 57.93
+    ds1000_Pandas_accuracy: 8.93
+    ds1000_Numpy_accuracy: 4.09
+    ds1000_Tensorflow_accuracy: 11.11
+    ds1000_Scipy_accuracy: 7.55
+    ds1000_Sklearn_accuracy: 7.83
+    ds1000_Pytorch_accuracy: 8.82
+    ds1000_Matplotlib_accuracy: 50.97
+    mmmlu_lite_accuracy: 45.02
+    openai_mmmlu_lite_AR-XY_accuracy: 18.6
+    openai_mmmlu_lite_BN-BD_accuracy: 27.58
+    openai_mmmlu_lite_DE-DE_accuracy: 51.23
+    openai_mmmlu_lite_ES-LA_accuracy: 56.63
+    openai_mmmlu_lite_FR-FR_accuracy: 58.11
+    openai_mmmlu_lite_HI-IN_accuracy: 33.82
+    openai_mmmlu_lite_ID-ID_accuracy: 50.39
+    openai_mmmlu_lite_IT-IT_accuracy: 50.39
+    openai_mmmlu_lite_JA-JP_accuracy: 50.95
+    openai_mmmlu_lite_KO-KR_accuracy: 45.05
+    openai_mmmlu_lite_PT-BR_accuracy: 57.89
+    openai_mmmlu_lite_SW-KE_accuracy: 32.14
+    openai_mmmlu_lite_YO-NG_accuracy: 32.14
+    openai_mmmlu_lite_ZH-CN_accuracy: 65.33
+    college_naive_average: 21
+    high_naive_average: 47
+    middle_naive_average: 59.67
+    primary_naive_average: 76
+    arithmetic_naive_average: 62
+    mathbench-a (average)_naive_average: 53.13
+    college_knowledge_naive_average: 68.99
+    high_knowledge_naive_average: 70.06
+    middle_knowledge_naive_average: 78.53
+    primary_knowledge_naive_average: 88.49
+    mathbench-t (average)_naive_average: 76.51
+
+qwen2.5-7b-instruct-pytorch:
+  objective:
+    race-high_accuracy: 85.16
+    ARC-c_accuracy: 90.85
+    BoolQ_accuracy: 86.61
+    triviaqa_wiki_1shot_score: 52.96
+    nq_open_1shot_score: 17.62
+    mmmlu_lite_naive_average: 54.7
+    IFEval_Prompt-level-strict-accuracy: 71.35
+    drop_accuracy: 80.23
+    bbh_naive_average: 68.88
+    GPQA_diamond_accuracy: 36.36
+    hellaswag_accuracy: 85.49
+    TheoremQA_score: 18.38
+    musr_average_naive_average: 43.3
+    korbench_single_naive_average: 39.44
+    ARC_Prize_Public_Evaluation_accuracy: 0
+    gsm8k_accuracy: 91.66
+    GaokaoBench_weighted_average: 80.02
+    math_accuracy: 73.74
+    cmo_fib_accuracy: 26.44
+    aime2024_accuracy: 13.33
+    Mathbench_naive_average: 77.08
+    wikibench-wiki-single_choice_cncircular_perf_4: 34
+    cmmlu_naive_average: 75.9
+    mmlu_naive_average: 76.27
+    mmlu_pro_naive_average: 56.14
+    openai_humaneval_humaneval_pass@1: 84.76
+    sanitized_mbpp_score: 74.71
+    humanevalx_naive_average: 48.17
+    ds1000_naive_average: 18.57
+    lcb_code_generation_pass@1: 38.75
+    lcb_code_execution_pass@1: 42.38
+    lcb_test_output_pass@1: 50.45
+    bigcodebench_hard_instruct_pass@1: 16.89
+    bigcodebench_hard_complete_pass@1: 12.16
+    teval_naive_average: 79.46
+    SciCode_sub_accuracy: 10.42
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.27
+    mmlu-stem_accuracy: 77.75
+    mmlu-social-science_accuracy: 78.65
+    mmlu-humanities_accuracy: 73.12
+    mmlu-other_accuracy: 75.05
+    cmmlu_accuracy: 75.9
+    cmmlu-stem_accuracy: 73.41
+    cmmlu-social-science_accuracy: 75.97
+    cmmlu-humanities_accuracy: 76.42
+    cmmlu-other_accuracy: 78.15
+    cmmlu-china-specific_accuracy: 73.27
+    mmlu_pro_accuracy: 56.14
+    mmlu_pro_biology_accuracy: 72.25
+    mmlu_pro_business_accuracy: 66.16
+    mmlu_pro_chemistry_accuracy: 55.65
+    mmlu_pro_computer_science_accuracy: 60.24
+    mmlu_pro_economics_accuracy: 66.82
+    mmlu_pro_engineering_accuracy: 41.38
+    mmlu_pro_health_accuracy: 54.89
+    mmlu_pro_history_accuracy: 46.46
+    mmlu_pro_law_accuracy: 29.06
+    mmlu_pro_math_accuracy: 73.58
+    mmlu_pro_philosophy_accuracy: 44.89
+    mmlu_pro_physics_accuracy: 60.05
+    mmlu_pro_psychology_accuracy: 61.9
+    mmlu_pro_other_accuracy: 52.6
+    humanevalx-python_pass@1: 51.83
+    humanevalx-cpp_pass@1: 42.68
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 73.78
+    humanevalx-js_pass@1: 72.56
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 8.64
+    ds1000_Tensorflow_accuracy: 17.78
+    ds1000_Scipy_accuracy: 15.09
+    ds1000_Sklearn_accuracy: 8.7
+    ds1000_Pytorch_accuracy: 4.41
+    ds1000_Matplotlib_accuracy: 61.29
+    mmmlu_lite_accuracy: 54.7
+    openai_mmmlu_lite_AR-XY_accuracy: 42.32
+    openai_mmmlu_lite_BN-BD_accuracy: 42.18
+    openai_mmmlu_lite_DE-DE_accuracy: 60
+    openai_mmmlu_lite_ES-LA_accuracy: 66.18
+    openai_mmmlu_lite_FR-FR_accuracy: 66.88
+    openai_mmmlu_lite_HI-IN_accuracy: 48.63
+    openai_mmmlu_lite_ID-ID_accuracy: 61.26
+    openai_mmmlu_lite_IT-IT_accuracy: 65.26
+    openai_mmmlu_lite_JA-JP_accuracy: 60.7
+    openai_mmmlu_lite_KO-KR_accuracy: 60.63
+    openai_mmmlu_lite_PT-BR_accuracy: 54.46
+    openai_mmmlu_lite_SW-KE_accuracy: 36
+    openai_mmmlu_lite_YO-NG_accuracy: 31.86
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.4
+    college_naive_average: 48.33
+    high_naive_average: 59.33
+    middle_naive_average: 76.67
+    primary_naive_average: 86.67
+    arithmetic_naive_average: 74.33
+    mathbench-a (average)_naive_average: 69.07
+    college_knowledge_naive_average: 83.54
+    high_knowledge_naive_average: 80.82
+    middle_knowledge_naive_average: 83.79
+    primary_knowledge_naive_average: 92.22
+    mathbench-t (average)_naive_average: 85.1
+
+internlm3-8b-instruct-turbomind:
+  objective:
+    race-high_accuracy: 89.22
+    ARC-c_accuracy: 92.54
+    BoolQ_accuracy: 86.45
+    triviaqa_wiki_1shot_score: 60.72
+    nq_open_1shot_score: 20.25
+    mmmlu_lite_naive_average: 41.82
+    IFEval_Prompt-level-strict-accuracy: 77.45
+    drop_accuracy: 83.27
+    bbh_naive_average: 55.22
+    GPQA_diamond_accuracy: 37.88
+    hellaswag_accuracy: 91.28
+    TheoremQA_score: 20.12
+    musr_average_naive_average: 36.86
+    korbench_single_naive_average: 41.2
+    ARC_Prize_Public_Evaluation_accuracy: 0.06
+    gsm8k_accuracy: 91.28
+    GaokaoBench_weighted_average: 86.59
+    math_accuracy: 76.96
+    cmo_fib_accuracy: 35.1
+    aime2024_accuracy: 16.67
+    Mathbench_naive_average: 78.96
+    wikibench-wiki-single_choice_cncircular_perf_4: 37.45
+    cmmlu_naive_average: 83.33
+    mmlu_naive_average: 76.21
+    mmlu_pro_naive_average: 57.96
+    openai_humaneval_humaneval_pass@1: 81.71
+    sanitized_mbpp_score: 69.65
+    humanevalx_naive_average: 40.73
+    ds1000_naive_average: 27.23
+    lcb_code_generation_pass@1: 34.75
+    lcb_code_execution_pass@1: 49.9
+    lcb_test_output_pass@1: 48.19
+    bigcodebench_hard_instruct_pass@1: 13.51
+    bigcodebench_hard_complete_pass@1: 15.54
+    teval_naive_average: 82.86
+    SciCode_sub_accuracy: 11.11
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.21
+    mmlu-stem_accuracy: 77.7
+    mmlu-social-science_accuracy: 80.98
+    mmlu-humanities_accuracy: 70.83
+    mmlu-other_accuracy: 75.01
+    cmmlu_accuracy: 83.33
+    cmmlu-stem_accuracy: 79.66
+    cmmlu-social-science_accuracy: 83.39
+    cmmlu-humanities_accuracy: 84.73
+    cmmlu-other_accuracy: 86.2
+    cmmlu-china-specific_accuracy: 81.77
+    mmlu_pro_accuracy: 57.96
+    mmlu_pro_biology_accuracy: 75.45
+    mmlu_pro_business_accuracy: 64.64
+    mmlu_pro_chemistry_accuracy: 59.81
+    mmlu_pro_computer_science_accuracy: 60.24
+    mmlu_pro_economics_accuracy: 68.6
+    mmlu_pro_engineering_accuracy: 44.79
+    mmlu_pro_health_accuracy: 58.31
+    mmlu_pro_history_accuracy: 49.87
+    mmlu_pro_law_accuracy: 32.43
+    mmlu_pro_math_accuracy: 70.17
+    mmlu_pro_philosophy_accuracy: 46.89
+    mmlu_pro_physics_accuracy: 59.58
+    mmlu_pro_psychology_accuracy: 66.29
+    mmlu_pro_other_accuracy: 54.33
+    humanevalx-python_pass@1: 43.9
+    humanevalx-cpp_pass@1: 20.12
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 74.39
+    humanevalx-js_pass@1: 65.24
+    ds1000_Pandas_accuracy: 16.49
+    ds1000_Numpy_accuracy: 34.09
+    ds1000_Tensorflow_accuracy: 26.67
+    ds1000_Scipy_accuracy: 17.92
+    ds1000_Sklearn_accuracy: 20.87
+    ds1000_Pytorch_accuracy: 19.12
+    ds1000_Matplotlib_accuracy: 55.48
+    mmmlu_lite_accuracy: 41.82
+    openai_mmmlu_lite_AR-XY_accuracy: 32.56
+    openai_mmmlu_lite_BN-BD_accuracy: 4.56
+    openai_mmmlu_lite_DE-DE_accuracy: 24.91
+    openai_mmmlu_lite_ES-LA_accuracy: 51.09
+    openai_mmmlu_lite_FR-FR_accuracy: 61.68
+    openai_mmmlu_lite_HI-IN_accuracy: 24.98
+    openai_mmmlu_lite_ID-ID_accuracy: 44.56
+    openai_mmmlu_lite_IT-IT_accuracy: 52.35
+    openai_mmmlu_lite_JA-JP_accuracy: 51.02
+    openai_mmmlu_lite_KO-KR_accuracy: 47.93
+    openai_mmmlu_lite_PT-BR_accuracy: 53.89
+    openai_mmmlu_lite_SW-KE_accuracy: 33.47
+    openai_mmmlu_lite_YO-NG_accuracy: 33.47
+    openai_mmmlu_lite_ZH-CN_accuracy: 69.05
+    college_naive_average: 45.67
+    high_naive_average: 64.67
+    middle_naive_average: 82.33
+    primary_naive_average: 90.33
+    arithmetic_naive_average: 74
+    mathbench-a (average)_naive_average: 71.4
+    college_knowledge_naive_average: 85.28
+    high_knowledge_naive_average: 79.43
+    middle_knowledge_naive_average: 87.9
+    primary_knowledge_naive_average: 93.42
+    mathbench-t (average)_naive_average: 86.51
+
+internlm3-8b-instruct-pytorch:
+  objective:
+    race-high_accuracy: 89.02
+    ARC-c_accuracy: 93.56
+    BoolQ_accuracy: 86.67
+    triviaqa_wiki_1shot_score: 60.54
+    nq_open_1shot_score: 20.3
+    mmmlu_lite_naive_average: 42.6
+    IFEval_Prompt-level-strict-accuracy: 79.11
+    drop_accuracy: 83.32
+    bbh_naive_average: 54.76
+    GPQA_diamond_accuracy: 33.84
+    hellaswag_accuracy: 91.31
+    TheoremQA_score: 18
+    musr_average_naive_average: 36.62
+    korbench_single_naive_average: 41.84
+    ARC_Prize_Public_Evaluation_accuracy: 0.06
+    gsm8k_accuracy: 90.67
+    GaokaoBench_weighted_average: 86.27
+    math_accuracy: 76.68
+    cmo_fib_accuracy: 33.65
+    aime2024_accuracy: 10
+    Mathbench_naive_average: 78.92
+    wikibench-wiki-single_choice_cncircular_perf_4: 37.35
+    cmmlu_naive_average: 83.11
+    mmlu_naive_average: 76.23
+    mmlu_pro_naive_average: 58.16
+    openai_humaneval_humaneval_pass@1: 82.32
+    sanitized_mbpp_score: 70.04
+    humanevalx_naive_average: 39.76
+    ds1000_naive_average: 27.84
+    lcb_code_generation_pass@1: 34.5
+    lcb_code_execution_pass@1: 48.02
+    lcb_test_output_pass@1: 47.74
+    bigcodebench_hard_instruct_pass@1: 12.84
+    bigcodebench_hard_complete_pass@1: 15.54
+    teval_naive_average: 82.86
+    SciCode_sub_accuracy: 9.38
+    qa_dingo_cn_score: 100
+    mmlu_accuracy: 76.23
+    mmlu-stem_accuracy: 78.08
+    mmlu-social-science_accuracy: 80.31
+    mmlu-humanities_accuracy: 71.38
+    mmlu-other_accuracy: 74.63
+    cmmlu_accuracy: 83.11
+    cmmlu-stem_accuracy: 79.42
+    cmmlu-social-science_accuracy: 83.34
+    cmmlu-humanities_accuracy: 83.95
+    cmmlu-other_accuracy: 86.22
+    cmmlu-china-specific_accuracy: 81.5
+    mmlu_pro_accuracy: 58.16
+    mmlu_pro_biology_accuracy: 74.62
+    mmlu_pro_business_accuracy: 65.02
+    mmlu_pro_chemistry_accuracy: 60.69
+    mmlu_pro_computer_science_accuracy: 61.46
+    mmlu_pro_economics_accuracy: 68.25
+    mmlu_pro_engineering_accuracy: 45.3
+    mmlu_pro_health_accuracy: 60.15
+    mmlu_pro_history_accuracy: 50.66
+    mmlu_pro_law_accuracy: 31.7
+    mmlu_pro_math_accuracy: 70.32
+    mmlu_pro_philosophy_accuracy: 47.7
+    mmlu_pro_physics_accuracy: 59.51
+    mmlu_pro_psychology_accuracy: 65.41
+    mmlu_pro_other_accuracy: 53.46
+    humanevalx-python_pass@1: 42.68
+    humanevalx-cpp_pass@1: 19.51
+    humanevalx-go_pass@1: 0
+    humanevalx-java_pass@1: 72.56
+    humanevalx-js_pass@1: 64.02
+    ds1000_Pandas_accuracy: 14.09
+    ds1000_Numpy_accuracy: 35
+    ds1000_Tensorflow_accuracy: 24.44
+    ds1000_Scipy_accuracy: 20.75
+    ds1000_Sklearn_accuracy: 21.74
+    ds1000_Pytorch_accuracy: 22.06
+    ds1000_Matplotlib_accuracy: 56.77
+    mmmlu_lite_accuracy: 42.6
+    openai_mmmlu_lite_AR-XY_accuracy: 32.84
+    openai_mmmlu_lite_BN-BD_accuracy: 10.46
+    openai_mmmlu_lite_DE-DE_accuracy: 24.56
|
openai_mmmlu_lite_ES-LA_accuracy: 50.95
|
||||||
|
openai_mmmlu_lite_FR-FR_accuracy: 61.05
|
||||||
|
openai_mmmlu_lite_HI-IN_accuracy: 30.6
|
||||||
|
openai_mmmlu_lite_ID-ID_accuracy: 45.89
|
||||||
|
openai_mmmlu_lite_IT-IT_accuracy: 51.79
|
||||||
|
openai_mmmlu_lite_JA-JP_accuracy: 51.65
|
||||||
|
openai_mmmlu_lite_KO-KR_accuracy: 48.77
|
||||||
|
openai_mmmlu_lite_PT-BR_accuracy: 52.7
|
||||||
|
openai_mmmlu_lite_SW-KE_accuracy: 32.91
|
||||||
|
openai_mmmlu_lite_YO-NG_accuracy: 32.84
|
||||||
|
openai_mmmlu_lite_ZH-CN_accuracy: 69.33
|
||||||
|
college_naive_average: 47
|
||||||
|
high_naive_average: 66.67
|
||||||
|
middle_naive_average: 81.67
|
||||||
|
primary_naive_average: 89.33
|
||||||
|
arithmetic_naive_average: 73.67
|
||||||
|
mathbench-a (average)_naive_average: 71.67
|
||||||
|
college_knowledge_naive_average: 82.91
|
||||||
|
high_knowledge_naive_average: 79.86
|
||||||
|
middle_knowledge_naive_average: 88.92
|
||||||
|
primary_knowledge_naive_average: 92.96
|
||||||
|
mathbench-t (average)_naive_average: 86.16
|
||||||
|
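The figures above are the regression baselines that the daily fullbench runs are checked against. As a rough sketch of how such a baseline YAML could be compared with a fresh run's summary (the repository's actual `.github/scripts/oc_score_assert.py` is not part of this diff, so the helper names, summary-CSV layout, and the 3% tolerance below are assumptions):

```python
# Illustrative only: a minimal baseline check, not the repository's oc_score_assert.py.
import csv

import yaml


def load_baseline(path: str, model: str) -> dict:
    """Return the metric -> value mapping recorded for one model."""
    with open(path, encoding='utf-8') as f:
        baseline = yaml.safe_load(f)
    return baseline[model]['objective']


def assert_scores(summary_csv: str, model: str, baseline: dict, rel_tol: float = 0.03):
    """Compare each summary row against its recorded baseline value."""
    with open(summary_csv, encoding='utf-8') as f:
        for row in csv.DictReader(f):
            # Key layout is an assumption: '<dataset>_<metric>' as in the YAML above.
            key = f"{row['dataset']}_{row['metric']}"
            if key not in baseline:
                continue
            expected = float(baseline[key])
            actual = float(row[model])
            assert abs(actual - expected) <= abs(expected) * rel_tol, (
                f'{key}: got {actual}, baseline {expected}')
```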
290 .github/scripts/oc_score_baseline_testrange.yaml vendored

@@ -1,27 +1,30 @@
 chat:
 glm-4-9b-chat-hf:
-gsm8k_accuracy: 68.75
+gsm8k_accuracy: 56.25
-race-high_accuracy: 90.62
+race-high_accuracy: 84.38
 glm-4-9b-chat-turbomind:
 gsm8k_accuracy: 71.88
 race-high_accuracy: 90.62
 glm-4-9b-chat-vllm:
-gsm8k_accuracy: 65.62
+gsm8k_accuracy: 71.88
 race-high_accuracy: 90.62
 deepseek-7b-chat-hf:
 gsm8k_accuracy: 46.88
 race-high_accuracy: 81.25
-deepseek-moe-16b-chat-hf:
+deepseek-r1-distill-llama-8b-turbomind:
-gsm8k_accuracy: 50
+gsm8k_accuracy: 31.25
-race-high_accuracy: 68.75
+race-high_accuracy: 81.25
+deepseek-r1-distill-qwen-1_5b-turbomind:
+gsm8k_accuracy: 37.5
+race-high_accuracy: 53.12
 deepseek-7b-chat-vllm:
 gsm8k_accuracy: 43.75
-race-high_accuracy: 75
+race-high_accuracy: 78.12
 gemma2-2b-it-hf:
 gsm8k_accuracy: 50
-race-high_accuracy: 71.88
+race-high_accuracy: 75
 gemma2-9b-it-hf:
-gsm8k_accuracy: 71.88
+gsm8k_accuracy: 68.75
 race-high_accuracy: 84.38
 gemma-2b-it-hf:
 gsm8k_accuracy: 3.12
@@ -36,34 +39,40 @@ chat:
 gsm8k_accuracy: 78.12
 race-high_accuracy: 93.75
 gemma-7b-it-vllm:
-gsm8k_accuracy: 34.38
+gsm8k_accuracy: 31.25
 race-high_accuracy: 68.75
 internlm2_5-7b-chat-hf:
 gsm8k_accuracy: 84.38
 race-high_accuracy: 90.62
+internlm3-8b-instruct-hf:
+gsm8k_accuracy: 65.62
+race-high_accuracy: 87.5
 internlm2_5-7b-chat-turbomind:
-gsm8k_accuracy: 87.50
+gsm8k_accuracy: 84.38
 race-high_accuracy: 90.62
 internlm2-chat-1.8b-turbomind:
 gsm8k_accuracy: 28.12
 race-high_accuracy: 84.38
 internlm2-chat-1.8b-sft-turbomind:
-gsm8k_accuracy: 21.88
+gsm8k_accuracy: 31.25
 race-high_accuracy: 84.38
 internlm2-chat-7b-lmdeploy:
-gsm8k_accuracy: 53.12
+gsm8k_accuracy: 59.38
 race-high_accuracy: 84.38
 internlm2-chat-7b-sft-turbomind:
-gsm8k_accuracy: 53.12
-race-high_accuracy: 90.62
-internlm2-chat-7b-vllm:
 gsm8k_accuracy: 56.25
-race-high_accuracy: 84.38
+race-high_accuracy: 90.62
+internlm3-8b-instruct-turbomind:
+gsm8k_accuracy: 68.75
+race-high_accuracy: 87.5
+internlm2-chat-7b-vllm:
+gsm8k_accuracy: 59.38
+race-high_accuracy: 87.50
 llama-3_1-8b-instruct-hf:
 gsm8k_accuracy: 84.38
 race-high_accuracy: 90.62
 llama-3_2-3b-instruct-hf:
-gsm8k_accuracy: 65.62
+gsm8k_accuracy: 71.88
 race-high_accuracy: 81.25
 llama-3-8b-instruct-hf:
 gsm8k_accuracy: 68.75
@@ -72,14 +81,14 @@ chat:
 gsm8k_accuracy: 18.75
 race-high_accuracy: 46.88
 llama-3_1-8b-instruct-turbomind:
-gsm8k_accuracy: 78.12
+gsm8k_accuracy: 81.25
 race-high_accuracy: 90.62
 llama-3_2-3b-instruct-turbomind:
-gsm8k_accuracy: 62.50
+gsm8k_accuracy: 68.75
 race-high_accuracy: 81.25
 llama-3-8b-instruct-turbomind:
-gsm8k_accuracy: 71.88
+gsm8k_accuracy: 68.75
-race-high_accuracy: 87.5
+race-high_accuracy: 84.38
 mistral-7b-instruct-v0.2-hf:
 gsm8k_accuracy: 40.62
 race-high_accuracy: 75
@@ -90,17 +99,14 @@ chat:
 gsm8k_accuracy: 75
 race-high_accuracy: 81.25
 mistral-nemo-instruct-2407-turbomind:
-gsm8k_accuracy: 65.62
+gsm8k_accuracy: 71.88
-race-high_accuracy: 87.50
+race-high_accuracy: 78.12
 mistral-7b-instruct-v0.1-vllm:
 gsm8k_accuracy: 34.38
-race-high_accuracy: 68.75
+race-high_accuracy: 65.62
 mistral-7b-instruct-v0.2-vllm:
-gsm8k_accuracy: 43.75
+gsm8k_accuracy: 21.88
-race-high_accuracy: 75
+race-high_accuracy: 78.12
-phi-3-mini-4k-instruct-hf:
-gsm8k_accuracy: 81.25
-race-high_accuracy: 87.50
 qwen2.5-0.5b-instruct-hf:
 gsm8k_accuracy: 34.38
 race-high_accuracy: 46.88
@@ -108,10 +114,10 @@ chat:
 gsm8k_accuracy: 53.12
 race-high_accuracy: 90.62
 qwen2.5-0.5b-instruct-turbomind:
-gsm8k_accuracy: 28.12
+gsm8k_accuracy: 31.25
-race-high_accuracy: 50
+race-high_accuracy: 43.75
 qwen2.5-3b-instruct-turbomind:
-gsm8k_accuracy: 59.38
+gsm8k_accuracy: 56.25
 race-high_accuracy: 90.62
 qwen1.5-0.5b-chat-hf:
 gsm8k_accuracy: 0
@@ -123,11 +129,11 @@ chat:
 gsm8k_accuracy: 68.75
 race-high_accuracy: 90.62
 qwen2-1.5b-instruct-turbomind:
-gsm8k_accuracy: 53.12
+gsm8k_accuracy: 56.25
 race-high_accuracy: 84.38
 qwen2-7b-instruct-turbomind:
 gsm8k_accuracy: 81.25
-race-high_accuracy: 90.62
+race-high_accuracy: 87.50
 qwen1.5-0.5b-chat-vllm:
 gsm8k_accuracy: 3.12
 race-high_accuracy: 53.12
@@ -143,11 +149,11 @@ chat:
 yi-1.5-9b-chat-turbomind:
 gsm8k_accuracy: 71.88
 race-high_accuracy: 93.75
-deepseek-v2-lite-chat-hf:
+deepseek-v2_lite-chat-turbomind:
-gsm8k_accuracy: 46.88
+gsm8k_accuracy: 37.5
 race-high_accuracy: 71.88
 gemma2-27b-it-hf:
-gsm8k_accuracy: 75
+gsm8k_accuracy: 71.88
 race-high_accuracy: 93.75
 internlm2_5-20b-chat-hf:
 gsm8k_accuracy: 84.38
@@ -161,6 +167,9 @@ chat:
 mistral-small-instruct-2409-turbomind:
 gsm8k_accuracy: 81.25
 race-high_accuracy: 87.50
+phi-4:
+gsm8k_accuracy: 81.25
+race-high_accuracy: 87.50
 qwen2.5-14b-instruct-hf:
 gsm8k_accuracy: 71.88
 race-high_accuracy: 96.88
@@ -168,40 +177,41 @@ chat:
 gsm8k_accuracy: 68.75
 race-high_accuracy: 93.75
 yi-1.5-34b-chat-turbomind:
-gsm8k_accuracy: 78.12
+gsm8k_accuracy: 75.00
 race-high_accuracy: 93.75
-deepseek-67b-chat-hf:
+deepseek-67b-chat-turbomind:
-gsm8k_accuracy: 71.88
+gsm8k_accuracy: 75.00
 race-high_accuracy: 78.12
+deepseek-r1-distill-qwen-32b-turbomind:
+gsm8k_accuracy: 25
+race-high_accuracy: 90.62
 llama-3_3-70b-instruct-turbomind:
 gsm8k_accuracy: 93.75
 race-high_accuracy: 87.5
-mixtral-8x7b-instruct-v0.1-hf:
-gsm8k_accuracy: 56.25
-race-high_accuracy: 81.25
 mixtral-large-instruct-2411-turbomind:
-gsm8k_accuracy: 90.62
+gsm8k_accuracy: 87.50
 race-high_accuracy: 93.75
 nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
-gsm8k_accuracy: 87.5
+gsm8k_accuracy: 93.75
-race-high_accuracy: 46.88
+race-high_accuracy: 50.00
 qwen2.5-72b-instruct-turbomind:
-gsm8k_accuracy: 75
+gsm8k_accuracy: 81.25
-race-high_accuracy: 93.75
+race-high_accuracy: 90.62
+deepseek-r1-distill-llama-70b-turbomind:
+gsm8k_accuracy: 40.62
+race-high_accuracy: 90.62
 deepseek-v2_5-1210-turbomind:
 gsm8k_accuracy: 90.62
 race-high_accuracy: 84.38
-mixtral-8x22b-instruct-v0.1-hf:
+mixtral-8x22b-instruct-v0.1-turbomind:
-gsm8k_accuracy: 81.25
+gsm8k_accuracy: 78.12
-race-high_accuracy: 81.25
+race-high_accuracy: 78.12
+mixtral-8x22b-instruct-v0.1-vllm:
+gsm8k_accuracy: 78.12
+race-high_accuracy: 78.12
 base:
-glm-4-9b-hf:
-gsm8k_accuracy: 68.75
-GPQA_diamond_accuracy: 31.25
-race-high_accuracy: 93.75
-winogrande_accuracy: 84.38
 glm-4-9b-turbomind:
-gsm8k_accuracy: 62.5
+gsm8k_accuracy: 59.38
 GPQA_diamond_accuracy: 28.12
 race-high_accuracy: 93.75
 winogrande_accuracy: 84.38
@@ -210,15 +220,10 @@ base:
 GPQA_diamond_accuracy: 0
 race-high_accuracy: 46.88
 winogrande_accuracy: 71.88
-deepseek-moe-16b-base-hf:
-gsm8k_accuracy: 21.88
-GPQA_diamond_accuracy: 0
-race-high_accuracy: 21.88
-winogrande_accuracy: 65.62
 deepseek-7b-base-turbomind:
-gsm8k_accuracy: 21.88
+gsm8k_accuracy: 18.75
 GPQA_diamond_accuracy: 0
-race-high_accuracy: 46.88
+race-high_accuracy: 43.75
 winogrande_accuracy: 84.38
 deepseek-moe-16b-base-vllm:
 gsm8k_accuracy: 21.88
@@ -226,35 +231,40 @@ base:
 race-high_accuracy: 25
 winogrande_accuracy: 68.75
 gemma2-2b-hf:
-gsm8k_accuracy: 28.12
+gsm8k_accuracy: 31.25
 GPQA_diamond_accuracy: 3.12
 race-high_accuracy: 56.25
-winogrande_accuracy: 71.88
+winogrande_accuracy: 75.00
 gemma2-9b-hf:
-gsm8k_accuracy: 68.75
+gsm8k_accuracy: 75.00
 GPQA_diamond_accuracy: 0
-race-high_accuracy: 81.25
+race-high_accuracy: 84.38
-winogrande_accuracy: 84.38
+winogrande_accuracy: 81.25
 gemma-2b-hf:
-gsm8k_accuracy: 18.75
+gsm8k_accuracy: 21.88
 GPQA_diamond_accuracy: 3.12
-race-high_accuracy: 25
+race-high_accuracy: 21.88
 winogrande_accuracy: 53.12
 gemma-7b-hf:
 gsm8k_accuracy: 56.25
-GPQA_diamond_accuracy: 6.25
+GPQA_diamond_accuracy: 3.12
 race-high_accuracy: 65.62
-winogrande_accuracy: 78.12
+winogrande_accuracy: 71.88
+gemma-2-9b-turbomind:
+gsm8k_accuracy: 68.75
+GPQA_diamond_accuracy: 0
+race-high_accuracy: 18.75
+winogrande_accuracy: 46.88
 gemma-2b-vllm:
 gsm8k_accuracy: 15.62
 GPQA_diamond_accuracy: 3.12
-race-high_accuracy:
+race-high_accuracy: 28.12
-winogrande_accuracy:
+winogrande_accuracy: 68.75
 gemma-7b-vllm:
-gsm8k_accuracy: 53.12
+gsm8k_accuracy: 43.75
-GPQA_diamond_accuracy: 9.38
+GPQA_diamond_accuracy: 6.25
-race-high_accuracy:
+race-high_accuracy: 81.25
-winogrande_accuracy:
+winogrande_accuracy: 81.25
 internlm2_5-7b-hf:
 gsm8k_accuracy: 37.5
 GPQA_diamond_accuracy: 25
@@ -265,31 +275,26 @@ base:
 GPQA_diamond_accuracy: 18.75
 race-high_accuracy: 62.5
 winogrande_accuracy: 78.12
-internlm2-base-7b-hf:
-gsm8k_accuracy: 3.12
-GPQA_diamond_accuracy: 21.88
-race-high_accuracy: 75
-winogrande_accuracy: 65.62
 internlm2-1.8b-turbomind:
-gsm8k_accuracy: 12.5
+gsm8k_accuracy: 6.25
-GPQA_diamond_accuracy: 9.38
+GPQA_diamond_accuracy: 12.5
 race-high_accuracy: 71.88
-winogrande_accuracy: 78.12
-internlm2_5-7b-turbomind:
-gsm8k_accuracy: 62.50
-GPQA_diamond_accuracy: 34.38
-race-high_accuracy: 93.75
-winogrande_accuracy: 87.50
-internlm2-7b-turbomind:
-gsm8k_accuracy: 53.12
-GPQA_diamond_accuracy: 21.88
-race-high_accuracy: 71.88
-winogrande_accuracy: 84.38
-internlm2-base-7b-turbomind:
-gsm8k_accuracy: 37.50
-GPQA_diamond_accuracy: 28.12
-race-high_accuracy: 81.25
 winogrande_accuracy: 75
+internlm2_5-7b-turbomind:
+gsm8k_accuracy: 62.5
+GPQA_diamond_accuracy: 31.25
+race-high_accuracy: 93.75
+winogrande_accuracy: 87.5
+internlm2-7b-turbomind:
+gsm8k_accuracy: 59.38
+GPQA_diamond_accuracy: 34.38
+race-high_accuracy: 78.12
+winogrande_accuracy: 71.88
+internlm2-base-7b-turbomind:
+gsm8k_accuracy: 28.12
+GPQA_diamond_accuracy: 31.25
+race-high_accuracy: 71.88
+winogrande_accuracy: 62.50
 llama-2-7b-hf:
 gsm8k_accuracy: 21.88
 GPQA_diamond_accuracy: 21.88
@@ -306,15 +311,15 @@ base:
 race-high_accuracy: 65.62
 winogrande_accuracy: 65.62
 llama-3.1-8b-turbomind:
-gsm8k_accuracy: 56.25
+gsm8k_accuracy: 59.38
-GPQA_diamond_accuracy: 9.38
+GPQA_diamond_accuracy: 15.62
 race-high_accuracy: 78.12
 winogrande_accuracy: 78.12
 llama-3-8b-turbomind:
-gsm8k_accuracy: 50
+gsm8k_accuracy: 46.88
 GPQA_diamond_accuracy: 12.50
 race-high_accuracy: 65.62
-winogrande_accuracy: 78.12
+winogrande_accuracy: 81.25
 mistral-7b-v0.3-hf:
 gsm8k_accuracy: 31.25
 GPQA_diamond_accuracy: 6.25
@@ -326,15 +331,15 @@ base:
 race-high_accuracy: 87.5
 winogrande_accuracy: 71.88
 qwen2.5-1.5b-turbomind:
-gsm8k_accuracy: 62.50
+gsm8k_accuracy: 59.38
-GPQA_diamond_accuracy: 12.50
+GPQA_diamond_accuracy: 18.75
-race-high_accuracy: 78.12
+race-high_accuracy: 75
-winogrande_accuracy: 68.75
-qwen2.5-7b-turbomind:
-gsm8k_accuracy: 75.00
-GPQA_diamond_accuracy: 25
-race-high_accuracy: 87.5
 winogrande_accuracy: 71.88
+qwen2.5-7b-turbomind:
+gsm8k_accuracy: 71.88
+GPQA_diamond_accuracy: 18.75
+race-high_accuracy: 87.5
+winogrande_accuracy: 75.00
 qwen1.5-moe-a2.7b-hf:
 gsm8k_accuracy: 62.5
 GPQA_diamond_accuracy: 18.75
@@ -356,20 +361,20 @@ base:
 race-high_accuracy: 87.5
 winogrande_accuracy: 68.75
 qwen2-1.5b-turbomind:
-gsm8k_accuracy: 56.25
+gsm8k_accuracy: 59.38
-GPQA_diamond_accuracy: 9.38
+GPQA_diamond_accuracy: 6.25
 race-high_accuracy: 81.25
 winogrande_accuracy: 75
 qwen2-7b-turbomind:
-gsm8k_accuracy: 75.00
+gsm8k_accuracy: 62.5
 GPQA_diamond_accuracy: 12.5
 race-high_accuracy: 87.5
-winogrande_accuracy: 71.88
+winogrande_accuracy: 75
 qwen1.5-0.5b-vllm:
 gsm8k_accuracy: 9.38
 GPQA_diamond_accuracy: 0
 race-high_accuracy: 56.25
-winogrande_accuracy: 62.5
+winogrande_accuracy: 59.38
 yi-1.5-6b-hf:
 gsm8k_accuracy: 62.5
 GPQA_diamond_accuracy: 3.12
@@ -384,25 +389,10 @@ base:
 gsm8k_accuracy: 78.12
 GPQA_diamond_accuracy: 40.62
 race-high_accuracy: 87.5
-winogrande_accuracy: 71.88
-deepseek-v2-lite-hf:
-gsm8k_accuracy: 31.25
-GPQA_diamond_accuracy: 28.12
-race-high_accuracy: 59.38
-winogrande_accuracy: 71.88
-internlm2-20b-hf:
-gsm8k_accuracy: 56.25
-GPQA_diamond_accuracy: 15.62
-race-high_accuracy: 68.75
-winogrande_accuracy: 75
-internlm2-base-20b-hf:
-gsm8k_accuracy: 12.5
-GPQA_diamond_accuracy: 9.38
-race-high_accuracy: 84.38
 winogrande_accuracy: 65.62
 internlm2-20b-turbomind:
 gsm8k_accuracy: 71.88
-GPQA_diamond_accuracy: 15.62
+GPQA_diamond_accuracy: 18.75
 race-high_accuracy: 68.75
 winogrande_accuracy: 81.25
 qwen2.5-14b-hf:
@@ -420,33 +410,23 @@ base:
 GPQA_diamond_accuracy: 28.12
 race-high_accuracy: 93.75
 winogrande_accuracy: 81.25
-deepseek-67b-base-hf:
-gsm8k_accuracy: 59.38
-GPQA_diamond_accuracy: 31.25
-race-high_accuracy: 81.25
-winogrande_accuracy: 90.62
 deepseek-67b-base-turbomind:
-gsm8k_accuracy: 56.25
-GPQA_diamond_accuracy: 28.12
-race-high_accuracy: 81.25
-winogrande_accuracy: 84.38
-llama-3-70b-turbomind:
 gsm8k_accuracy: 59.38
-GPQA_diamond_accuracy: 9.38
+GPQA_diamond_accuracy: 34.38
+race-high_accuracy: 78.12
+winogrande_accuracy: 81.25
+llama-3-70b-turbomind:
+gsm8k_accuracy: 56.25
+GPQA_diamond_accuracy: 15.62
 race-high_accuracy: 93.75
 winogrande_accuracy: 84.38
 qwen2.5-72b-turbomind:
 gsm8k_accuracy: 84.38
-GPQA_diamond_accuracy: 34.38
+GPQA_diamond_accuracy: 31.25
 race-high_accuracy: 93.75
 winogrande_accuracy: 87.5
 deepseek-v2-turbomind:
-gsm8k_accuracy: 71.88
+gsm8k_accuracy: 65.62
-GPQA_diamond_accuracy: 3.12
+GPQA_diamond_accuracy: 9.38
-race-high_accuracy: 81.25
-winogrande_accuracy: 75
-llama-3-70b-hf:
-gsm8k_accuracy: 62.5
-GPQA_diamond_accuracy: 3.12
 race-high_accuracy: 93.75
-winogrande_accuracy: 84.38
+winogrande_accuracy: 81.25
54 .github/workflows/daily-run-test.yml vendored

@@ -17,7 +17,7 @@ on:
 required: false
 description: 'whether to build lmdeploy'
 type: boolean
-default: false
+default: true
 repo_org_lmdeploy:
 required: false
 description: 'Tested repository organization name. Default is internlm/lmdeploy'
@@ -44,7 +44,7 @@ on:
 type: string
 default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
 schedule:
-- cron: '15 14 * * *'
+- cron: '15 14 * * 0,3'

 env:
 HF_DATASETS_OFFLINE: 1
@@ -61,6 +61,7 @@ env:
 HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
 HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
 CONDA_ENV: regression_test
+export VLLM_WORKER_MULTIPROC_METHOD: spawn

 jobs:
 build-pypi:
@@ -87,12 +88,11 @@ jobs:
 name: my-artifact-${{ github.run_id }}

 build-pypi-lmdeploy:
-if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
+if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}}
 strategy:
 matrix:
 pyver: [py310]
 runs-on: ubuntu-latest
-environment: 'prod'
 env:
 PYTHON_VERSION: ${{ matrix.pyver }}
 PLAT_NAME: manylinux2014_x86_64
@@ -126,8 +126,7 @@ jobs:
 if: ${{!cancelled()}}
 needs: ['build-pypi', 'build-pypi-lmdeploy']
 runs-on: volc_cu12
-environment: 'prod'
-timeout-minutes: 240 #4hours
+timeout-minutes: 120 #2hours
 steps:
 - name: Clone repository
 uses: actions/checkout@v2
@@ -148,7 +147,7 @@ jobs:
 uses: nick-fields/retry@v3
 with:
 max_attempts: 1
-timeout_minutes: 240
+timeout_minutes: 120
 command: |
 . ${{env.CONDA_PATH}}/bin/activate
 conda create -y --name ${{env.CONDA_ENV}} python=3.10
@@ -157,20 +156,23 @@ jobs:
 pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
 pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
 pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
-pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
+pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
+pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
+pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
 FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
 pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
 cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
 - name: Prepare - reinstall lmdeploy - cu12
-if: ${{inputs.build_lmdeploy}}
+if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
 uses: actions/download-artifact@v4
 with:
 name: my-artifact-${{ github.run_id }}-py310
 - name: Prepare - reinstall lmdeploy - cu12
-if: ${{inputs.build_lmdeploy}}
+if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
 run: |
 . ${{env.CONDA_PATH}}/bin/activate
 conda activate ${{env.CONDA_ENV}}
+pip uninstall -y lmdeploy
 pip install lmdeploy-*.whl --no-deps
 - name: conda env
 run: |
@@ -187,8 +189,7 @@ jobs:
 matrix:
 regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
 runs-on: volc_cu12_daily
-environment: 'prod'
-timeout-minutes: 120 #2hours
+timeout-minutes: 180 #3hours
 steps:
 - name: Clone repository
 uses: actions/checkout@v2
@@ -210,7 +211,7 @@ jobs:
 uses: nick-fields/retry@v3
 with:
 max_attempts: 1
-timeout_minutes: 120
+timeout_minutes: 180
 command: |
 . ${{env.CONDA_PATH}}/bin/activate
 conda activate ${{env.CONDA_ENV}}
@@ -228,8 +229,7 @@ jobs:
 matrix:
 regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
 runs-on: volc_cu12_local
-environment: 'prod'
-timeout-minutes: 240 #4hours
+timeout-minutes: 480 #6hours
 steps:
 - name: Clone repository
 uses: actions/checkout@v2
@@ -255,27 +255,33 @@ jobs:
 conda info --envs
 export from_tf=TRUE
 python tools/list_configs.py internlm2_5 mmlu
-opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
+opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
 rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
 python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
-opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
+opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
 rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
 python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
 opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
 rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
 python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
-opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
+opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
 rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
 python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
+opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
+rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
+python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
 - name: Run model test - api
 if: matrix.regression_func == 'api'
 run: |
 . ${{env.CONDA_PATH}}/bin/activate
 conda activate ${{env.CONDA_ENV}}
 conda info --envs
-lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
+lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
 echo "restful_pid=$!" >> "$GITHUB_ENV"
-sleep 120s
+sleep 180s
+env | grep PROXY
+env | grep proxy
+unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
 opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
 rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
 python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@@ -304,8 +310,7 @@ jobs:
 matrix:
 function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
 runs-on: volc_cu12
-environment: 'prod'
-timeout-minutes: 360 #6hours
+timeout-minutes: 480 #6hours
 steps:
 - name: Clone repository
 uses: actions/checkout@v2
@@ -322,7 +327,7 @@ jobs:
 uses: nick-fields/retry@v3
 with:
 max_attempts: 1
-timeout_minutes: 360
+timeout_minutes: 480
 command: |
 . ${{env.CONDA_PATH}}/bin/activate
 conda activate ${{env.CONDA_ENV}}
@@ -334,11 +339,10 @@ jobs:


 notify_to_feishu:
-if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
+if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
 needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
 timeout-minutes: 5
 runs-on: self-hosted
-environment: 'prod'
 steps:
 - name: notify
 run: |
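The cmd and api steps above select their checks with pytest markers (`-m case1` … `-m case5`, `-m api`). A minimal sketch of what such a marked assertion might look like, assuming a `result_scores` fixture that parses the linked `regression_result_daily` summary (the real `oc_score_assert.py` is not shown in this diff, so the fixture name, model key, and threshold are assumptions):

```python
# Illustrative sketch only -- not the repository's oc_score_assert.py.
import pytest


@pytest.mark.case5  # matches `python -m pytest -m case5 ...` in the workflow
def test_internlm3_vllm_demo_gsm8k(result_scores):
    # Assumed fixture shape: {model_abbr: {dataset_abbr: score}}.
    score = result_scores['internlm3-8b-instruct-vllm']['demo_gsm8k']
    assert score >= 60, f'demo_gsm8k regressed: {score}'
```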
2 .github/workflows/pr-run-test.yml vendored

@@ -45,7 +45,7 @@ jobs:
 . ${{env.CONDA_PATH}}/bin/activate
 conda activate ${{env.CONDA_ENV}}
 python3 -m pip uninstall opencompass -y
-python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
+python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
 conda info --envs
 - name: conda env
 run: |
6 .github/workflows/pr-stage-check.yml vendored

@@ -20,7 +20,7 @@ jobs:
 matrix:
 python-version: ['3.10']
 include:
-- torch: 2.0.0
+- torch: 2.5.1
 steps:
 - uses: actions/checkout@v3
 - name: Set up Python ${{ matrix.python-version }}
@@ -30,7 +30,7 @@ jobs:
 - name: Upgrade pip
 run: python -m pip install --upgrade pip
 - name: Install PyTorch
-run: pip install torch==${{matrix.torch}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+run: pip install torch==${{matrix.torch}} -f https://download.pytorch.org/whl/cpu/torch_stable.html
 - name: Install system dependencies
 run: |
 sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
@@ -106,7 +106,7 @@ jobs:
 - name: Upgrade pip
 run: python -m pip install pip --upgrade
 - name: Install PyTorch
-run: pip install torch==2.0.0+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
+run: pip install torch==2.5.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html
 - name: Install opencompass dependencies
 run: |
 pip install -r requirements.txt
|
362
README.md
362
README.md
@ -57,6 +57,10 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
|
|||||||
|
|
||||||
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
||||||
|
|
||||||
|
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
|
||||||
|
- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
|
||||||
|
- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
|
||||||
|
- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
|
||||||
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
|
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
|
||||||
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
|
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
|
||||||
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
|
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
|
||||||
@ -173,69 +177,83 @@ Some third-party features, like Humaneval and Llama, may require additional step
|
|||||||
|
|
||||||
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared. Now you can start your first evaluation using OpenCompass!
|
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared. Now you can start your first evaluation using OpenCompass!
|
||||||
|
|
||||||
- Your first evaluation with OpenCompass!
|
### Your first evaluation with OpenCompass!
|
||||||
|
|
||||||
OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder.
|
OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# CLI
|
# CLI
|
||||||
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
|
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
|
||||||
|
|
||||||
# Python scripts
|
# Python scripts
|
||||||
opencompass examples/eval_chat_demo.py
|
opencompass examples/eval_chat_demo.py
|
||||||
```
|
```
|
||||||
|
|
||||||
You can find more script examples under [examples](./examples) folder.
|
You can find more script examples under [examples](./examples) folder.
|
||||||
|
|
||||||
- API evaluation
|
### API evaluation
|
||||||
|
|
||||||
OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings.
|
OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
|
export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
|
||||||
# CLI
|
# CLI
|
||||||
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
|
opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
|
||||||
|
|
||||||
# Python scripts
|
# Python scripts
|
||||||
opencompass examples/eval_api_demo.py
|
opencompass examples/eval_api_demo.py
|
||||||
|
|
||||||
# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
|
# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
|
||||||
```
|
```
|
||||||
|
|
||||||
- Accelerated Evaluation
|
### Accelerated Evaluation
|
||||||
|
|
||||||
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
|
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# CLI
|
# CLI
|
||||||
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
|
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
|
||||||
|
|
||||||
# Python scripts
|
# Python scripts
|
||||||
opencompass examples/eval_lmdeploy_demo.py
|
opencompass examples/eval_lmdeploy_demo.py
|
||||||
```
|
```
|
||||||
|
|
||||||
- Supported Models
|
### Supported Models and Datasets
|
||||||
|
|
||||||
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
|
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# List all configurations
|
# List all configurations
|
||||||
python tools/list_configs.py
|
python tools/list_configs.py
|
||||||
# List all configurations related to llama and mmlu
|
# List all configurations related to llama and mmlu
|
||||||
python tools/list_configs.py llama mmlu
|
python tools/list_configs.py llama mmlu
|
||||||
```
|
```
|
||||||
|
|
||||||
If the model is not on the list but supported by Huggingface AutoModel class, you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
|
#### Supported Models
|
||||||
|
|
||||||
```bash
|
If the model is not on the list but supported by Huggingface AutoModel class or encapsulation of inference engine based on OpenAI interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
|
||||||
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
|
|
||||||
```
|
|
||||||
|
|
||||||
If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
|
```bash
|
||||||
|
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
|
||||||
|
```
|
||||||
|
|
||||||
```bash
|
#### Supported Datasets
|
||||||
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
|
|
||||||
```
|
Currently, OpenCompass have provided standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llm_judge_gen.py` will point to the recommended config we provide for this dataset. You can refer to [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Recommended Evaluation Config based on Rules
|
||||||
|
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
|
||||||
|
|
||||||
|
# Recommended Evaluation Config based on LLM Judge
|
||||||
|
opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
|
||||||
|
```
|
||||||
|
|
||||||
|
If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
|
||||||
|
```
|
||||||
|
|
||||||
> \[!TIP\]
|
> \[!TIP\]
|
||||||
>
|
>
|
||||||
@ -279,263 +297,15 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide

## 📖 Dataset Support

A statistical list of all datasets that can be used on this platform is maintained in the documentation on the OpenCompass website.

You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.

In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.

Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.

<table align="center">
  <tbody>
    <tr align="center" valign="bottom">
      <td><b>Language</b></td>
      <td><b>Knowledge</b></td>
      <td><b>Reasoning</b></td>
      <td><b>Examination</b></td>
    </tr>
    <tr valign="top">
      <td>
        <b>Word Definition</b>: WiC, SummEdits<br>
        <b>Idiom Learning</b>: CHID<br>
        <b>Semantic Similarity</b>: AFQMC, BUSTM<br>
        <b>Coreference Resolution</b>: CLUEWSC, WSC, WinoGrande<br>
        <b>Translation</b>: Flores, IWSLT2017<br>
        <b>Multi-language Question Answering</b>: TyDi-QA, XCOPA<br>
        <b>Multi-language Summary</b>: XLSum
      </td>
      <td>
        <b>Knowledge Question Answering</b>: BoolQ, CommonSenseQA, NaturalQuestions, TriviaQA
      </td>
      <td>
        <b>Textual Entailment</b>: CMNLI, OCNLI, OCNLI_FC, AX-b, AX-g, CB, RTE, ANLI<br>
        <b>Commonsense Reasoning</b>: StoryCloze, COPA, ReCoRD, HellaSwag, PIQA, SIQA<br>
        <b>Mathematical Reasoning</b>: MATH, GSM8K<br>
        <b>Theorem Application</b>: TheoremQA, StrategyQA, SciBench<br>
        <b>Comprehensive Reasoning</b>: BBH
      </td>
      <td>
        <b>Junior High, High School, University, Professional Examinations</b>: C-Eval, AGIEval, MMLU, GAOKAO-Bench, CMMLU, ARC, Xiezhi<br>
        <b>Medical Examinations</b>: CMB
      </td>
    </tr>
  </tbody>
  <tbody>
    <tr align="center" valign="bottom">
      <td><b>Understanding</b></td>
      <td><b>Long Context</b></td>
      <td><b>Safety</b></td>
      <td><b>Code</b></td>
    </tr>
    <tr valign="top">
      <td>
        <b>Reading Comprehension</b>: C3, CMRC, DRCD, MultiRC, RACE, DROP, OpenBookQA, SQuAD2.0<br>
        <b>Content Summary</b>: CSL, LCSTS, XSum, SummScreen<br>
        <b>Content Analysis</b>: EPRSTMT, LAMBADA, TNEWS
      </td>
      <td>
        <b>Long Context Understanding</b>: LEval, LongBench, GovReports, NarrativeQA, Qasper
      </td>
      <td>
        <b>Safety</b>: CivilComments, CrowsPairs, CValues, JigsawMultilingual, TruthfulQA<br>
        <b>Robustness</b>: AdvGLUE
      </td>
      <td>
        <b>Code</b>: HumanEval, HumanEvalX, MBPP, APPs, DS1000
      </td>
    </tr>
  </tbody>
</table>

<p align="right"><a href="#top">🔝Back to top</a></p>

## 📖 Model Support

284 README_zh-CN.md
@ -57,6 +57,10 @@

## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, which lets multiple evaluators work in sequence so that custom evaluation pipelines can be built for more complex scenarios. See the [documentation](docs/zh_cn/advanced_guides/llm_judge.md) for usage! 🔥🔥🔥
- **\[2025.03.11\]** `SuperGPQA`, a knowledge benchmark covering 285 graduate-level disciplines, is now supported. Welcome to try it! 🔥🔥🔥
- **\[2025.02.28\]** We added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/zh_cn/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluation and `MATHEvaluator` for mathematical reasoning evaluation. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) documentation for more details! 🔥🔥🔥
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model, which achieves the best performance among models of its size on reasoning and knowledge tasks. Welcome to try it.
- **\[2024.12.17\]** We provide the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py) academic leaderboard; the official results can be reproduced with a simple configuration.
- **\[2024.10.14\]** The OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU) is now supported. Welcome to try it! 🔥🔥🔥
@ -205,9 +209,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce

opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
```

- ### Supported Models and Datasets

OpenCompass predefines configurations for many models and datasets. You can list all available model and dataset configurations with the [tool](./docs/zh_cn/tools.md#ListConfigs).

```bash
# List all configurations
@ -216,13 +220,27 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
python tools/list_configs.py llama mmlu
```

#### Supported Models

If a model is not on the list but is supported by the Huggingface AutoModel class or by an inference engine wrapper based on the OpenAI interface (see the [docs](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html) for details), you can still evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.

```bash
opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
```

#### Supported Datasets

Currently, OpenCompass provides standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llm_judge_gen.py` point to the recommended config we provide for that dataset. See the dataset statistics chapter of the [docs](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) for details.

```bash
# Recommended rule-based evaluation config
opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat

# Recommended LLM-Judge-based evaluation config
opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
```

In addition, if you want to run model inference on multiple GPUs in data parallel, you can use the `--max-num-worker` argument.

```bash
CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
```

@ -274,263 +292,11 @@ OpenCompass is a one-stop platform for large model evaluation. Its main features are as follows

## 📖 Dataset Support

A statistical list of all datasets that can be used on this platform is maintained in the documentation on the OpenCompass website.

You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.

Please refer to the dataset statistics chapter of the [docs](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) for details.

<table align="center">
  <tbody>
    <tr align="center" valign="bottom">
      <td><b>Language</b></td>
      <td><b>Knowledge</b></td>
      <td><b>Reasoning</b></td>
      <td><b>Examination</b></td>
    </tr>
    <tr valign="top">
      <td>
        <b>Word Definition</b>: WiC, SummEdits<br>
        <b>Idiom Learning</b>: CHID<br>
        <b>Semantic Similarity</b>: AFQMC, BUSTM<br>
        <b>Coreference Resolution</b>: CLUEWSC, WSC, WinoGrande<br>
        <b>Translation</b>: Flores, IWSLT2017<br>
        <b>Multi-language Question Answering</b>: TyDi-QA, XCOPA<br>
        <b>Multi-language Summary</b>: XLSum
      </td>
      <td>
        <b>Knowledge Question Answering</b>: BoolQ, CommonSenseQA, NaturalQuestions, TriviaQA
      </td>
      <td>
        <b>Textual Entailment</b>: CMNLI, OCNLI, OCNLI_FC, AX-b, AX-g, CB, RTE, ANLI<br>
        <b>Commonsense Reasoning</b>: StoryCloze, COPA, ReCoRD, HellaSwag, PIQA, SIQA<br>
        <b>Mathematical Reasoning</b>: MATH, GSM8K<br>
        <b>Theorem Application</b>: TheoremQA, StrategyQA, SciBench<br>
        <b>Comprehensive Reasoning</b>: BBH
      </td>
      <td>
        <b>Junior High / High School / University / Professional Examinations</b>: C-Eval, AGIEval, MMLU, GAOKAO-Bench, CMMLU, ARC, Xiezhi<br>
        <b>Medical Examinations</b>: CMB
      </td>
    </tr>
  </tbody>
  <tbody>
    <tr align="center" valign="bottom">
      <td><b>Understanding</b></td>
      <td><b>Long Context</b></td>
      <td><b>Safety</b></td>
      <td><b>Code</b></td>
    </tr>
    <tr valign="top">
      <td>
        <b>Reading Comprehension</b>: C3, CMRC, DRCD, MultiRC, RACE, DROP, OpenBookQA, SQuAD2.0<br>
        <b>Content Summary</b>: CSL, LCSTS, XSum, SummScreen<br>
        <b>Content Analysis</b>: EPRSTMT, LAMBADA, TNEWS
      </td>
      <td>
        <b>Long Context Understanding</b>: LEval, LongBench, GovReports, NarrativeQA, Qasper
      </td>
      <td>
        <b>Safety</b>: CivilComments, CrowsPairs, CValues, JigsawMultilingual, TruthfulQA<br>
        <b>Robustness</b>: AdvGLUE
      </td>
      <td>
        <b>Code</b>: HumanEval, HumanEvalX, MBPP, APPs, DS1000
      </td>
    </tr>
  </tbody>
</table>

<p align="right"><a href="#top">🔝Back to top</a></p>

999 dataset-index.yml (new file)
@ -0,0 +1,999 @@
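
Each entry in the index below maps a dataset key to a small schema (name, category, paper, configpath, configpath_llmjudge). As a rough sketch of how the index could be consumed, assuming PyYAML is installed and the file is read from the repository root (neither of which this commit prescribes):

```python
# Illustrative only: load dataset-index.yml and look up one dataset's
# recommended config path. The schema fields come from the file itself;
# the loading code is an assumption, not part of the repository.
import yaml

with open("dataset-index.yml", "r", encoding="utf-8") as f:
    index = yaml.safe_load(f)  # a list of single-key mappings

entries = {}
for item in index:
    for key, meta in item.items():
        entries[key] = meta  # holds name / category / paper / config paths

print(entries["ifeval"]["name"])        # -> IFEval
print(entries["ifeval"]["configpath"])  # recommended rule-based config
```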
- ifeval:
|
||||||
|
name: IFEval
|
||||||
|
category: Instruction Following
|
||||||
|
paper: https://arxiv.org/pdf/2311.07911
|
||||||
|
configpath: opencompass/configs/datasets/IFEval/IFEval_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- nphard:
|
||||||
|
name: NPHardEval
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2312.14890v2
|
||||||
|
configpath: opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- pmmeval:
|
||||||
|
name: PMMEval
|
||||||
|
category: Language
|
||||||
|
paper: https://arxiv.org/pdf/2411.09116v1
|
||||||
|
configpath: opencompass/configs/datasets/PMMEval/pmmeval_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- theoremqa:
|
||||||
|
name: TheoremQA
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2305.12524
|
||||||
|
configpath: opencompass/configs/datasets/TheroremQA/TheoremQA_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- agieval:
|
||||||
|
name: AGIEval
|
||||||
|
category: Examination
|
||||||
|
paper: https://arxiv.org/pdf/2304.06364
|
||||||
|
configpath: opencompass/configs/datasets/agieval/agieval_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- babilong:
|
||||||
|
name: BABILong
|
||||||
|
category: Long Context
|
||||||
|
paper: https://arxiv.org/pdf/2406.10149
|
||||||
|
configpath: opencompass/configs/datasets/babilong
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- bigcodebench:
|
||||||
|
name: BigCodeBench
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2406.15877
|
||||||
|
configpath: opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- calm:
|
||||||
|
name: CaLM
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2405.00622
|
||||||
|
configpath: opencompass/configs/datasets/calm/calm.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- infinitebench:
|
||||||
|
name: InfiniteBench (∞Bench)
|
||||||
|
category: Long Context
|
||||||
|
paper: https://aclanthology.org/2024.acl-long.814.pdf
|
||||||
|
configpath: opencompass/configs/datasets/infinitebench/infinitebench.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- korbench:
|
||||||
|
name: KOR-Bench
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2410.06526v1
|
||||||
|
configpath: opencompass/configs/datasets/korbench/korbench_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py
|
||||||
|
- lawbench:
|
||||||
|
name: LawBench
|
||||||
|
category: Knowledge / Law
|
||||||
|
paper: https://arxiv.org/pdf/2309.16289
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py
|
||||||
|
- opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- leval:
|
||||||
|
name: L-Eval
|
||||||
|
category: Long Context
|
||||||
|
paper: https://arxiv.org/pdf/2307.11088v1
|
||||||
|
configpath: opencompass/configs/datasets/leval/leval.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- livecodebench:
|
||||||
|
name: LiveCodeBench
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2403.07974
|
||||||
|
configpath: opencompass/configs/datasets/livecodebench/livecodebench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- livemathbench:
|
||||||
|
name: LiveMathBench
|
||||||
|
category: Math
|
||||||
|
paper: https://arxiv.org/pdf/2412.13147
|
||||||
|
configpath: opencompass/configs/datasets/livemathbench/livemathbench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- livereasonbench:
|
||||||
|
name: LiveReasonBench
|
||||||
|
category: Reasoning
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- longbench:
|
||||||
|
name: LongBench
|
||||||
|
category: Long Context
|
||||||
|
paper: https://github.com/THUDM/LongBench
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/longbench/longbench.py
|
||||||
|
- opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- lveval:
|
||||||
|
name: LV-Eval
|
||||||
|
category: Long Context
|
||||||
|
paper: https://arxiv.org/pdf/2402.05136
|
||||||
|
configpath: opencompass/configs/datasets/lveval/lveval.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mastermath2024v1:
|
||||||
|
name: Mastermath2024v1
|
||||||
|
category: Math
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- medbench:
|
||||||
|
name: MedBench
|
||||||
|
category: Knowledge / Medicine
|
||||||
|
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
|
||||||
|
configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- MedXpertQA:
|
||||||
|
name: MedXpertQA
|
||||||
|
category: Knowledge / Medicine
|
||||||
|
paper: https://arxiv.org/abs/2501.18362
|
||||||
|
configpath: opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py
|
||||||
|
- musr:
|
||||||
|
name: MuSR
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2310.16049
|
||||||
|
configpath: opencompass/configs/datasets/musr/musr_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/musr/musr_llm_judge_gen.py
|
||||||
|
- needlebench:
|
||||||
|
name: NeedleBench
|
||||||
|
category: Long Context
|
||||||
|
paper: https://arxiv.org/pdf/2407.11963
|
||||||
|
configpath: opencompass/configs/datasets/needlebench
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- ruler:
|
||||||
|
name: RULER
|
||||||
|
category: Long Context
|
||||||
|
paper: https://arxiv.org/pdf/2404.06654
|
||||||
|
configpath: opencompass/configs/datasets/ruler
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- alignment:
|
||||||
|
name: AlignBench
|
||||||
|
category: Subjective / Alignment
|
||||||
|
paper: https://arxiv.org/pdf/2311.18743
|
||||||
|
configpath: opencompass/configs/datasets/subjective/alignbench
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- alpaca:
|
||||||
|
name: AlpacaEval
|
||||||
|
category: Subjective / Instruction Following
|
||||||
|
paper: https://github.com/tatsu-lab/alpaca_eval
|
||||||
|
configpath: opencompass/configs/datasets/subjective/aplaca_eval
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- arenahard:
|
||||||
|
name: Arena-Hard
|
||||||
|
category: Subjective / Chatbot
|
||||||
|
paper: https://lmsys.org/blog/2024-04-19-arena-hard/
|
||||||
|
configpath: opencompass/configs/datasets/subjective/arena_hard
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- flames:
|
||||||
|
name: FLAMES
|
||||||
|
category: Subjective / Alignment
|
||||||
|
paper: https://arxiv.org/pdf/2311.06899
|
||||||
|
configpath: opencompass/configs/datasets/subjective/flames/flames_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- fofo:
|
||||||
|
name: FOFO
|
||||||
|
category: Subjective / Format Following
|
||||||
|
paper: https://arxiv.org/pdf/2402.18667
|
||||||
|
configpath: opencompass/configs/datasets/subjective/fofo
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- followbench:
|
||||||
|
name: FollowBench
|
||||||
|
category: Subjective / Instruction Following
|
||||||
|
paper: https://arxiv.org/pdf/2310.20410
|
||||||
|
configpath: opencompass/configs/datasets/subjective/followbench
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- hellobench:
|
||||||
|
name: HelloBench
|
||||||
|
category: Subjective / Long Context
|
||||||
|
paper: https://arxiv.org/pdf/2409.16191
|
||||||
|
configpath: opencompass/configs/datasets/subjective/hellobench
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- judgerbench:
|
||||||
|
name: JudgerBench
|
||||||
|
category: Subjective / Long Context
|
||||||
|
paper: https://arxiv.org/pdf/2410.16256
|
||||||
|
configpath: opencompass/configs/datasets/subjective/judgerbench
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- multiround:
|
||||||
|
name: MT-Bench-101
|
||||||
|
category: Subjective / Multi-Round
|
||||||
|
paper: https://arxiv.org/pdf/2402.14762
|
||||||
|
configpath: opencompass/configs/datasets/subjective/multiround
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- wildbench:
|
||||||
|
name: WildBench
|
||||||
|
category: Subjective / Real Task
|
||||||
|
paper: https://arxiv.org/pdf/2406.04770
|
||||||
|
configpath: opencompass/configs/datasets/subjective/wildbench
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- teval:
|
||||||
|
name: T-Eval
|
||||||
|
category: Tool Utilization
|
||||||
|
paper: https://arxiv.org/pdf/2312.14033
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/teval/teval_en_gen.py
|
||||||
|
- opencompass/configs/datasets/teval/teval_zh_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- finalceiq:
|
||||||
|
name: FinanceIQ
|
||||||
|
category: Knowledge / Finance
|
||||||
|
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
|
||||||
|
configpath: opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- gaokaobench:
|
||||||
|
name: GAOKAOBench
|
||||||
|
category: Examination
|
||||||
|
paper: https://arxiv.org/pdf/2305.12474
|
||||||
|
configpath: opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- lcbench:
|
||||||
|
name: LCBench
|
||||||
|
category: Code
|
||||||
|
paper: https://github.com/open-compass/CodeBench/
|
||||||
|
configpath: opencompass/configs/datasets/LCBench/lcbench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- MMLUArabic:
|
||||||
|
name: ArabicMMLU
|
||||||
|
category: Language
|
||||||
|
paper: https://arxiv.org/pdf/2402.12840
|
||||||
|
configpath: opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- OpenFinData:
|
||||||
|
name: OpenFinData
|
||||||
|
category: Knowledge / Finance
|
||||||
|
paper: https://github.com/open-compass/OpenFinData
|
||||||
|
configpath: opencompass/configs/datasets/OpenFinData/OpenFinData_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- QuALITY:
|
||||||
|
name: QuALITY
|
||||||
|
category: Long Context
|
||||||
|
paper: https://arxiv.org/pdf/2112.08608
|
||||||
|
configpath: opencompass/configs/datasets/QuALITY/QuALITY_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- advglue:
|
||||||
|
name: Adversarial GLUE
|
||||||
|
category: Safety
|
||||||
|
paper: https://openreview.net/pdf?id=GF9cSKI3A_q
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_qnli/adv_glue_qnli_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_qqp/adv_glue_qqp_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_rte/adv_glue_rte_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_sst2/adv_glue_sst2_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- afqmcd:
|
||||||
|
name: CLUE / AFQMC
|
||||||
|
category: Language
|
||||||
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
|
configpath: opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- aime2024:
|
||||||
|
name: AIME2024
|
||||||
|
category: Examination
|
||||||
|
paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
|
||||||
|
configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
|
||||||
|
- anli:
|
||||||
|
name: Adversarial NLI
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/1910.14599v2
|
||||||
|
configpath: opencompass/configs/datasets/anli/anli_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- anthropics_evals:
|
||||||
|
name: Anthropics Evals
|
||||||
|
category: Safety
|
||||||
|
paper: https://arxiv.org/pdf/2212.09251
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/anthropics_evals/airisk_gen.py
|
||||||
|
- opencompass/configs/datasets/anthropics_evals/persona_gen.py
|
||||||
|
- opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- apps:
|
||||||
|
name: APPS
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2105.09938
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/apps/apps_gen.py
|
||||||
|
- opencompass/configs/datasets/apps/apps_mini_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- arc:
|
||||||
|
name: ARC
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/1803.05457
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/ARC_c/ARC_c_gen.py
|
||||||
|
- opencompass/configs/datasets/ARC_e/ARC_e_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- arc_prize_public_eval:
|
||||||
|
name: ARC Prize
|
||||||
|
category: ARC-AGI
|
||||||
|
paper: https://arcprize.org/guide#private
|
||||||
|
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- ax:
|
||||||
|
name: SuperGLUE / AX
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen.py
|
||||||
|
- opencompass/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- bbh:
|
||||||
|
name: BIG-Bench Hard
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2210.09261
|
||||||
|
configpath: opencompass/configs/datasets/bbh/bbh_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py
|
||||||
|
- bbeh:
|
||||||
|
name: BIG-Bench Extra Hard
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/abs/2502.19187
|
||||||
|
configpath: opencompass/configs/datasets/bbeh
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- BoolQ:
|
||||||
|
name: SuperGLUE / BoolQ
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
|
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- c3:
|
||||||
|
name: CLUE / C3 (C³)
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
|
configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- cb:
|
||||||
|
name: SuperGLUE / CB
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
|
configpath: opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- ceval:
|
||||||
|
name: C-EVAL
|
||||||
|
category: Examination
|
||||||
|
paper: https://arxiv.org/pdf/2305.08322v1
|
||||||
|
configpath: opencompass/configs/datasets/ceval/ceval_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- charm:
|
||||||
|
name: CHARM
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2403.14112
|
||||||
|
configpath: opencompass/configs/datasets/CHARM/charm_reason_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- chembench:
|
||||||
|
name: ChemBench
|
||||||
|
category: Knowledge / Chemistry
|
||||||
|
paper: https://arxiv.org/pdf/2404.01475
|
||||||
|
configpath: opencompass/configs/datasets/ChemBench/ChemBench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- chid:
|
||||||
|
name: FewCLUE / CHID
|
||||||
|
category: Language
|
||||||
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
|
configpath: opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- chinese_simpleqa:
|
||||||
|
name: Chinese SimpleQA
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://arxiv.org/pdf/2411.07140
|
||||||
|
configpath: opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- cibench:
|
||||||
|
name: CIBench
|
||||||
|
category: Code
|
||||||
|
paper: https://www.arxiv.org/pdf/2407.10499
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/CIBench/CIBench_generation_gen_8ab0dc.py
|
||||||
|
- opencompass/configs/datasets/CIBench/CIBench_template_gen_e6b12a.py
|
||||||
|
- opencompass/configs/datasets/CIBench/CIBench_template_oracle_gen_fecda1.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- civilcomments:
|
||||||
|
name: CivilComments
|
||||||
|
category: Safety
|
||||||
|
paper: https://arxiv.org/pdf/1903.04561
|
||||||
|
configpath: opencompass/configs/datasets/civilcomments/civilcomments_clp.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- clozeTest_maxmin:
|
||||||
|
name: Cloze Test-max/min
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2102.04664
|
||||||
|
configpath: opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- cluewsc:
|
||||||
|
name: FewCLUE / CLUEWSC
|
||||||
|
category: Language / WSC
|
||||||
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
|
configpath: opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- cmb:
|
||||||
|
name: CMB
|
||||||
|
category: Knowledge / Medicine
|
||||||
|
paper: https://arxiv.org/pdf/2308.08833
|
||||||
|
configpath: opencompass/configs/datasets/cmb/cmb_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- cmmlu:
|
||||||
|
name: CMMLU
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2306.09212
|
||||||
|
configpath: opencompass/configs/datasets/cmmlu/cmmlu_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py
|
||||||
|
- cmnli:
|
||||||
|
name: CLUE / CMNLI
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
|
configpath: opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- cmo_fib:
|
||||||
|
name: cmo_fib
|
||||||
|
category: Examination
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- cmrc:
|
||||||
|
name: CLUE / CMRC
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
|
configpath: opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- commonsenseqa:
|
||||||
|
name: CommonSenseQA
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://arxiv.org/pdf/1811.00937v2
|
||||||
|
configpath: opencompass/configs/datasets/commonsenseqa/commonsenseqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- commonsenseqa_cn:
|
||||||
|
name: CommonSenseQA-CN
|
||||||
|
category: Knowledge
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- copa:
|
||||||
|
name: SuperGLUE / COPA
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
|
configpath: opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- crowspairs:
|
||||||
|
name: CrowsPairs
|
||||||
|
category: Safety
|
||||||
|
paper: https://arxiv.org/pdf/2010.00133
|
||||||
|
configpath: opencompass/configs/datasets/crowspairs/crowspairs_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- crowspairs_cn:
|
||||||
|
name: CrowsPairs-CN
|
||||||
|
category: Safety
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/crowspairs_cn/crowspairscn_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- cvalues:
|
||||||
|
name: CVALUES
|
||||||
|
category: Safety
|
||||||
|
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
|
||||||
|
configpath: opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- drcd:
|
||||||
|
name: CLUE / DRCD
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
|
configpath: opencompass/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- drop:
|
||||||
|
name: DROP (DROP Simple Eval)
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1903.00161
|
||||||
|
configpath: opencompass/configs/datasets/drop/drop_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/drop/drop_llm_judge_gen.py
|
||||||
|
- ds1000:
|
||||||
|
name: DS-1000
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2211.11501
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- eprstmt:
|
||||||
|
name: FewCLUE / EPRSTMT
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
|
configpath: opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- flores:
|
||||||
|
name: Flores
|
||||||
|
category: Language
|
||||||
|
paper: https://aclanthology.org/D19-1632.pdf
|
||||||
|
configpath: opencompass/configs/datasets/flores/flores_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- game24:
|
||||||
|
name: Game24
|
||||||
|
category: Math
|
||||||
|
paper: https://huggingface.co/datasets/nlile/24-game
|
||||||
|
configpath: opencompass/configs/datasets/game24/game24_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- govrepcrs:
|
||||||
|
name: Government Report Dataset
|
||||||
|
category: Long Context
|
||||||
|
paper: https://aclanthology.org/2021.naacl-main.112.pdf
|
||||||
|
configpath: opencompass/configs/datasets/govrepcrs/govrepcrs_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- gpqa:
|
||||||
|
name: GPQA
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://arxiv.org/pdf/2311.12022v1
|
||||||
|
configpath: opencompass/configs/datasets/gpqa/gpqa_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py
|
||||||
|
- gsm8k:
|
||||||
|
name: GSM8K
|
||||||
|
category: Math
|
||||||
|
paper: https://arxiv.org/pdf/2110.14168v2
|
||||||
|
configpath: opencompass/configs/datasets/gsm8k/gsm8k_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- gsm_hard:
|
||||||
|
name: GSM-Hard
|
||||||
|
category: Math
|
||||||
|
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
|
||||||
|
configpath: opencompass/configs/datasets/gsm_hard/gsmhard_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- hle:
|
||||||
|
name: HLE (Humanity's Last Exam)
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://lastexam.ai/paper
|
||||||
|
configpath: opencompass/configs/datasets/HLE/hle_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- hellaswag:
|
||||||
|
name: HellaSwag
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/1905.07830
|
||||||
|
configpath: opencompass/configs/datasets/hellaswag/hellaswag_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py
|
||||||
|
- humaneval:
|
||||||
|
name: HumanEval
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2107.03374v2
|
||||||
|
configpath: opencompass/configs/datasets/humaneval/humaneval_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- humaneval_cn:
|
||||||
|
name: HumanEval-CN
|
||||||
|
category: Code
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- humaneval_multi:
|
||||||
|
name: Multi-HumanEval
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2210.14868
|
||||||
|
configpath: opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- humaneval_plus:
|
||||||
|
name: HumanEval+
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2305.01210
|
||||||
|
configpath: opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- humanevalx:
|
||||||
|
name: HumanEval-X
|
||||||
|
category: Code
|
||||||
|
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
|
||||||
|
configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- hungarian_math:
|
||||||
|
name: Hungarian_Math
|
||||||
|
category: Math
|
||||||
|
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
|
||||||
|
configpath: opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- iwslt2017:
|
||||||
|
name: IWSLT2017
|
||||||
|
category: Language
|
||||||
|
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
|
||||||
|
configpath: opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- jigsawmultilingual:
|
||||||
|
name: JigsawMultilingual
|
||||||
|
category: Safety
|
||||||
|
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
|
||||||
|
configpath: opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- lambada:
|
||||||
|
name: LAMBADA
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1606.06031
|
||||||
|
configpath: opencompass/configs/datasets/lambada/lambada_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- lcsts:
|
||||||
|
name: LCSTS
|
||||||
|
category: Understanding
|
||||||
|
paper: https://aclanthology.org/D15-1229.pdf
|
||||||
|
configpath: opencompass/configs/datasets/lcsts/lcsts_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- livestembench:
|
||||||
|
name: LiveStemBench
|
||||||
|
category: ''
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/livestembench/livestembench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- llm_compression:
|
||||||
|
name: LLM Compression
|
||||||
|
category: Bits Per Character (BPC)
|
||||||
|
paper: https://arxiv.org/pdf/2404.09937
|
||||||
|
configpath: opencompass/configs/datasets/llm_compression/llm_compression.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- math:
|
||||||
|
name: MATH
|
||||||
|
category: Math
|
||||||
|
paper: https://arxiv.org/pdf/2103.03874
|
||||||
|
configpath: opencompass/configs/datasets/math/math_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/math/math_llm_judge_gen.py
|
||||||
|
- math500:
|
||||||
|
name: MATH500
|
||||||
|
category: Math
|
||||||
|
paper: https://github.com/openai/prm800k
|
||||||
|
configpath: opencompass/configs/datasets/math/math_prm800k_500_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py
|
||||||
|
- math401:
|
||||||
|
name: MATH 401
|
||||||
|
category: Math
|
||||||
|
paper: https://arxiv.org/pdf/2304.02015
|
||||||
|
configpath: opencompass/configs/datasets/math401/math401_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mathbench:
|
||||||
|
name: MathBench
|
||||||
|
category: Math
|
||||||
|
paper: https://arxiv.org/pdf/2405.12209
|
||||||
|
configpath: opencompass/configs/datasets/mathbench/mathbench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mbpp:
|
||||||
|
name: MBPP
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2108.07732
|
||||||
|
configpath: opencompass/configs/datasets/mbpp/mbpp_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mbpp_cn:
|
||||||
|
name: MBPP-CN
|
||||||
|
category: Code
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mbpp_plus:
|
||||||
|
name: MBPP-PLUS
|
||||||
|
category: Code
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mgsm:
|
||||||
|
name: MGSM
|
||||||
|
category: Language / Math
|
||||||
|
paper: https://arxiv.org/pdf/2210.03057
|
||||||
|
configpath: opencompass/configs/datasets/mgsm/mgsm_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mmlu:
|
||||||
|
name: MMLU
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2009.03300
|
||||||
|
configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
|
||||||
|
- mmlu_cf:
|
||||||
|
name: MMLU-CF
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2412.15194
|
||||||
|
configpath: opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mmlu_pro:
|
||||||
|
name: MMLU-Pro
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2406.01574
|
||||||
|
configpath: opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py
|
||||||
|
- mmmlu:
|
||||||
|
name: MMMLU
|
||||||
|
category: Language / Understanding
|
||||||
|
paper: https://huggingface.co/datasets/openai/MMMLU
|
||||||
|
configpath:
|
||||||
|
- opencompass/configs/datasets/mmmlu/mmmlu_gen.py
|
||||||
|
- opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- multirc:
|
||||||
|
name: SuperGLUE / MultiRC
|
||||||
|
category: Understanding
|
||||||
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
|
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- multipl_e:
|
||||||
|
name: MultiPL-E
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2210.14868
|
||||||
|
configpath: opencompass/configs/datasets/multipl_e
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- narrativeqa:
|
||||||
|
name: NarrativeQA
|
||||||
|
category: Understanding
|
||||||
|
paper: https://github.com/google-deepmind/narrativeqa
|
||||||
|
configpath: opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- natural_question:
|
||||||
|
name: NaturalQuestions
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://github.com/google-research-datasets/natural-questions
|
||||||
|
configpath: opencompass/configs/datasets/nq/nq_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- natural_question_cn:
|
||||||
|
name: NaturalQuestions-CN
|
||||||
|
category: Knowledge
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/nq_cn/nqcn_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- obqa:
|
||||||
|
name: OpenBookQA
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://arxiv.org/pdf/1809.02789v1
|
||||||
|
configpath: opencompass/configs/datasets/obqa/obqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- olymmath:
|
||||||
|
name: OlymMATH
|
||||||
|
category: Math
|
||||||
|
paper: https://arxiv.org/abs/2503.21380
|
||||||
|
configpath: ''
|
||||||
|
configpath_llmjudge: opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py
|
||||||
|
- piqa:
|
||||||
|
name: PIQA
|
||||||
|
category: Knowledge / Physics
|
||||||
|
paper: https://arxiv.org/pdf/1911.11641v1
|
||||||
|
configpath: opencompass/configs/datasets/piqa/piqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- py150:
|
||||||
|
name: py150
|
||||||
|
category: Code
|
||||||
|
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
|
||||||
|
configpath: opencompass/configs/datasets/py150/py150_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- qasper:
|
||||||
|
name: Qasper
|
||||||
|
category: Long Context
|
||||||
|
paper: https://arxiv.org/pdf/2105.03011
|
||||||
|
configpath: opencompass/configs/datasets/qasper/qasper_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- qaspercut:
|
||||||
|
name: Qasper-Cut
|
||||||
|
category: Long Context
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/qaspercut/qaspercut_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- race:
|
||||||
|
name: RACE
|
||||||
|
category: Examination
|
||||||
|
paper: https://arxiv.org/pdf/1704.04683
|
||||||
|
configpath: opencompass/configs/datasets/race/race_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- realtoxicprompts:
|
||||||
|
name: RealToxicPrompts
|
||||||
|
category: Safety
|
||||||
|
paper: https://arxiv.org/pdf/2009.11462
|
||||||
|
configpath: opencompass/configs/datasets/realtoxicprompts/realtoxicprompts_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- record:
|
||||||
|
name: SuperGLUE / ReCoRD
|
||||||
|
category: Understanding
|
||||||
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
|
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- rte:
|
||||||
|
name: SuperGLUE / RTE
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
|
configpath: opencompass/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- ocnli:
|
||||||
|
name: CLUE / OCNLI
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
|
configpath: opencompass/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- ocnlifc:
|
||||||
|
name: FewCLUE / OCNLI-FC
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
|
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- rolebench:
|
||||||
|
name: RoleBench
|
||||||
|
category: Role Play
|
||||||
|
paper: https://arxiv.org/pdf/2310.00746
|
||||||
|
configpath: opencompass/configs/datasets/rolebench
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- s3eval:
|
||||||
|
name: S3Eval
|
||||||
|
category: Long Context
|
||||||
|
paper: https://aclanthology.org/2024.naacl-long.69.pdf
|
||||||
|
configpath: opencompass/configs/datasets/s3eval/s3eval_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- scibench:
|
||||||
|
name: SciBench
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
|
||||||
|
configpath: opencompass/configs/datasets/scibench/scibench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- scicode:
|
||||||
|
name: SciCode
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2407.13168
|
||||||
|
configpath: opencompass/configs/datasets/scicode/scicode_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- simpleqa:
|
||||||
|
name: SimpleQA
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://arxiv.org/pdf/2411.04368
|
||||||
|
configpath: opencompass/configs/datasets/SimpleQA/simpleqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- siqa:
|
||||||
|
name: SocialIQA
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/1904.09728
|
||||||
|
configpath: opencompass/configs/datasets/siqa/siqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- squad20:
|
||||||
|
name: SQuAD2.0
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1806.03822
|
||||||
|
configpath: opencompass/configs/datasets/squad20/squad20_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- storycloze:
|
||||||
|
name: StoryCloze
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://aclanthology.org/2022.emnlp-main.616.pdf
|
||||||
|
configpath: opencompass/configs/datasets/storycloze/storycloze_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- strategyqa:
|
||||||
|
name: StrategyQA
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2101.02235
|
||||||
|
configpath: opencompass/configs/datasets/strategyqa/strategyqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- summedits:
|
||||||
|
name: SummEdits
|
||||||
|
category: Language
|
||||||
|
paper: https://aclanthology.org/2023.emnlp-main.600.pdf
|
||||||
|
configpath: opencompass/configs/datasets/summedits/summedits_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- summscreen:
|
||||||
|
name: SummScreen
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2104.07091v1
|
||||||
|
configpath: opencompass/configs/datasets/summscreen/summscreen_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- svamp:
|
||||||
|
name: SVAMP
|
||||||
|
category: Math
|
||||||
|
paper: https://aclanthology.org/2021.naacl-main.168.pdf
|
||||||
|
configpath: opencompass/configs/datasets/SVAMP/svamp_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- tabmwp:
|
||||||
|
name: TabMWP
|
||||||
|
category: Math / Table
|
||||||
|
paper: https://arxiv.org/pdf/2209.14610
|
||||||
|
configpath: opencompass/configs/datasets/TabMWP/TabMWP_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- taco:
|
||||||
|
name: TACO
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2312.14852
|
||||||
|
configpath: opencompass/configs/datasets/taco/taco_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- tnews:
|
||||||
|
name: FewCLUE / TNEWS
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
|
configpath: opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- bustm:
|
||||||
|
name: FewCLUE / BUSTM
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
|
configpath: opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- csl:
|
||||||
|
name: FewCLUE / CSL
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
|
configpath: opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- ocnli_fc:
|
||||||
|
name: FewCLUE / OCNLI-FC
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
|
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- triviaqa:
|
||||||
|
name: TriviaQA
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://arxiv.org/pdf/1705.03551v2
|
||||||
|
configpath: opencompass/configs/datasets/triviaqa/triviaqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- triviaqarc:
|
||||||
|
name: TriviaQA-RC
|
||||||
|
category: Knowledge / Understanding
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/triviaqarc/triviaqarc_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- truthfulqa:
|
||||||
|
name: TruthfulQA
|
||||||
|
category: Safety
|
||||||
|
paper: https://arxiv.org/pdf/2109.07958v2
|
||||||
|
configpath: opencompass/configs/datasets/truthfulqa/truthfulqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- tydiqa:
|
||||||
|
name: TyDi-QA
|
||||||
|
category: Language
|
||||||
|
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
|
||||||
|
configpath: opencompass/configs/datasets/tydiqa/tydiqa_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- wic:
|
||||||
|
name: SuperGLUE / WiC
|
||||||
|
category: Language
|
||||||
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
|
configpath: opencompass/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- wsc:
|
||||||
|
name: SuperGLUE / WSC
|
||||||
|
category: Language / WSC
|
||||||
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
|
configpath: opencompass/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- winogrande:
|
||||||
|
name: WinoGrande
|
||||||
|
category: Language / WSC
|
||||||
|
paper: https://arxiv.org/pdf/1907.10641v2
|
||||||
|
configpath: opencompass/configs/datasets/winogrande/winogrande_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- xcopa:
|
||||||
|
name: XCOPA
|
||||||
|
category: Language
|
||||||
|
paper: https://arxiv.org/pdf/2005.00333
|
||||||
|
configpath: opencompass/configs/datasets/XCOPA/XCOPA_ppl.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- xiezhi:
|
||||||
|
name: Xiezhi
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://arxiv.org/pdf/2306.05783
|
||||||
|
configpath: opencompass/configs/datasets/xiezhi/xiezhi_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- xlsum:
|
||||||
|
name: XLSum
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/2106.13822v1
|
||||||
|
configpath: opencompass/configs/datasets/XLSum/XLSum_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- xsum:
|
||||||
|
name: Xsum
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1808.08745
|
||||||
|
configpath: opencompass/configs/datasets/Xsum/Xsum_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- cola:
|
||||||
|
name: GLUE / CoLA
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1804.07461
|
||||||
|
configpath: opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mprc:
|
||||||
|
name: GLUE / MRPC
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1804.07461
|
||||||
|
configpath: opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- qqp:
|
||||||
|
name: GLUE / QQP
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1804.07461
|
||||||
|
configpath: opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- omni_math:
|
||||||
|
name: Omni-MATH
|
||||||
|
category: Math
|
||||||
|
paper: https://omni-math.github.io/
|
||||||
|
configpath: opencompass/configs/datasets/omni_math/omni_math_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- wikibench:
|
||||||
|
name: WikiBench
|
||||||
|
category: Knowledge
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/wikibench/wikibench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- supergpqa:
|
||||||
|
name: SuperGPQA
|
||||||
|
category: Knowledge
|
||||||
|
paper: https://arxiv.org/pdf/2502.14739
|
||||||
|
configpath: opencompass/configs/datasets/supergpqa
|
||||||
|
configpath_llmjudge: ''
|
@ -1,10 +1,20 @@
|
|||||||
var collapsedSections = [];
|
var collapsedSections = ['Dataset Statistics'];
|
||||||
|
|
||||||
$(document).ready(function () {
|
$(document).ready(function () {
|
||||||
$('.model-summary').DataTable({
|
$('.dataset').DataTable({
|
||||||
"stateSave": false,
|
"stateSave": false,
|
||||||
"lengthChange": false,
|
"lengthChange": false,
|
||||||
"pageLength": 20,
|
"pageLength": 20,
|
||||||
"order": []
|
"order": [],
|
||||||
|
"language": {
|
||||||
|
"info": "Show _START_ to _END_ Items(Totally _TOTAL_ )",
|
||||||
|
"infoFiltered": "(Filtered from _MAX_ Items)",
|
||||||
|
"search": "Search:",
|
||||||
|
"zeroRecords": "Item Not Found",
|
||||||
|
"paginate": {
|
||||||
|
"next": "Next",
|
||||||
|
"previous": "Previous"
|
||||||
|
},
|
||||||
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
370
docs/en/advanced_guides/llm_judge.md
Normal file
370
docs/en/advanced_guides/llm_judge.md
Normal file
@ -0,0 +1,370 @@
|
|||||||
|
# LLM as Judge Evaluation
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
The GenericLLMEvaluator is particularly useful for scenarios where rule-based methods (like regular expressions) cannot perfectly judge outputs, such as:
|
||||||
|
|
||||||
|
- Cases where models output answer content without option identifiers
|
||||||
|
- Factual judgment datasets that are difficult to evaluate with rules
|
||||||
|
- Open-ended responses requiring complex understanding and reasoning
|
||||||
|
- Evaluations that would otherwise require designing a large number of rules
|
||||||
|
|
||||||
|
OpenCompass provides the GenericLLMEvaluator component to facilitate LLM-as-judge evaluations.
|
||||||
|
|
||||||
|
## Dataset Format
|
||||||
|
|
||||||
|
The dataset for LLM judge evaluation should be in either JSON Lines (.jsonl) or CSV format. Each entry should contain at least:
|
||||||
|
|
||||||
|
- A problem or question
|
||||||
|
- A reference answer or gold standard
|
||||||
|
- (The model's prediction will be generated during evaluation)
|
||||||
|
|
||||||
|
Example JSONL format:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"problem": "What is the capital of France?", "answer": "Paris"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Example CSV format:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
problem,answer
|
||||||
|
"What is the capital of France?","Paris"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Using LLM for Evaluation via Command Line
|
||||||
|
|
||||||
|
Some datasets in OpenCompass already include LLM judge configurations.
|
||||||
|
You need to use a model service (such as OpenAI or DeepSeek's official API) or start a model service locally using tools like LMDeploy, vLLM, or SGLang.
|
||||||
|
|
||||||
|
Then, you can set the environment variables for the evaluation service and evaluate models using the following commands:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
|
||||||
|
export OC_JUDGE_API_KEY=sk-1234
|
||||||
|
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that by default, OpenCompass will use these three environment variables, but if you use configuration files to configure the evaluation service, these environment variables will not take effect.
|
||||||
|
|
||||||
|
### Using LLM for Evaluation via Configuration Files
|
||||||
|
|
||||||
|
To set up an LLM judge evaluation, you'll need to configure three main components:
|
||||||
|
|
||||||
|
1. Dataset Reader Configuration
|
||||||
|
|
||||||
|
```python
|
||||||
|
reader_cfg = dict(
|
||||||
|
input_columns=['problem'], # Column name for the question
|
||||||
|
output_column='answer' # Column name for the reference answer
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Inference Configuration
|
||||||
|
|
||||||
|
```python
|
||||||
|
infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}', # Template for prompting the model
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Evaluation Configuration with LLM Judge
|
||||||
|
|
||||||
|
```python
|
||||||
|
eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator, # Using LLM as evaluator
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), # Template for the judge
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
path='path/to/your/dataset',
|
||||||
|
file_name='your_dataset.jsonl',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=YOUR_JUDGE_MODEL_CONFIG, # Configuration for the judge model
|
||||||
|
dict_postprocessor=dict(type=generic_llmjudge_postprocess), # Post-processing the judge's output
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using CustomDataset with GenericLLMEvaluator
|
||||||
|
|
||||||
|
Here's how to set up a complete configuration for LLM judge evaluation:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
from opencompass.datasets import CustomDataset
|
||||||
|
from opencompass.evaluator import GenericLLMEvaluator
|
||||||
|
from opencompass.datasets import generic_llmjudge_postprocess
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
|
||||||
|
# Import your judge model configuration
|
||||||
|
with read_base():
|
||||||
|
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
|
||||||
|
models as judge_model,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Define your judge template
|
||||||
|
JUDGE_TEMPLATE = """
|
||||||
|
Please evaluate whether the following response correctly answers the question.
|
||||||
|
Question: {problem}
|
||||||
|
Reference Answer: {answer}
|
||||||
|
Model Response: {prediction}
|
||||||
|
|
||||||
|
Is the model response correct? If correct, answer "A"; if incorrect, answer "B".
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
# Dataset reader configuration
|
||||||
|
reader_cfg = dict(input_columns=['problem'], output_column='answer')
|
||||||
|
|
||||||
|
# Inference configuration for the model being evaluated
|
||||||
|
infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}',
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Evaluation configuration with LLM judge
|
||||||
|
eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt=JUDGE_TEMPLATE),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
path='path/to/your/dataset',
|
||||||
|
file_name='your_dataset.jsonl',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=judge_model[0],
|
||||||
|
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||||
|
),
|
||||||
|
pred_role='BOT',
|
||||||
|
)
|
||||||
|
|
||||||
|
# Dataset configuration
|
||||||
|
datasets = [
|
||||||
|
dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
abbr='my-dataset',
|
||||||
|
path='path/to/your/dataset',
|
||||||
|
file_name='your_dataset.jsonl',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
infer_cfg=infer_cfg,
|
||||||
|
eval_cfg=eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# Model configuration for the model being evaluated
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='model-to-evaluate',
|
||||||
|
path='path/to/your/model',
|
||||||
|
# ... other model configurations
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# Output directory
|
||||||
|
work_dir = './outputs/llm_judge_eval'
|
||||||
|
```
|
||||||
|
|
||||||
|
## GenericLLMEvaluator
|
||||||
|
|
||||||
|
The GenericLLMEvaluator is designed to use an LLM as a judge for evaluating model outputs. Key features include:
|
||||||
|
|
||||||
|
1. Flexible prompt templates for instructing the judge
|
||||||
|
2. Support for various judge models (local or API-based)
|
||||||
|
3. Customizable evaluation criteria through prompt engineering
|
||||||
|
4. Post-processing of judge outputs to extract structured evaluations
|
||||||
|
|
||||||
|
**Important Note**: The current generic version of the judge template only supports outputs in the format of "A" (correct) or "B" (incorrect), and does not support other output formats (like "CORRECT" or "INCORRECT"). This is because the post-processing function `generic_llmjudge_postprocess` is specifically designed to parse this format.
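For intuition, here is a minimal sketch of what an A/B post-processing step of this kind might look like; it is an illustrative simplification, not the actual `generic_llmjudge_postprocess` implementation:

```python
import re

def toy_ab_postprocess(judge_output: str) -> bool:
    """Illustrative only: treat the first standalone 'A' or 'B' in the judge's
    reply as the verdict and map 'A' to 'correct'."""
    match = re.search(r'\b([AB])\b', judge_output)
    return match is not None and match.group(1) == 'A'
```

The actual post-processor additionally aggregates the per-sample verdicts into the accuracy reported in the evaluation output.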
|
||||||
|
|
||||||
|
The evaluator works by:
|
||||||
|
|
||||||
|
1. Taking the original problem, reference answer, and model prediction
|
||||||
|
2. Formatting them into a prompt for the judge model
|
||||||
|
3. Parsing the judge's response to determine the evaluation result (looking for "A" or "B")
|
||||||
|
4. Aggregating results across the dataset
|
||||||
|
|
||||||
|
If you would like to see the full details of evaluation results, you can add `--dump-eval-details` to the command line when you start the job.
|
||||||
|
Example evaluation output:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
'accuracy': 75.0, # Percentage of responses judged as correct
|
||||||
|
'details': [
|
||||||
|
{
|
||||||
|
'origin_prompt': """
|
||||||
|
Please evaluate whether the following response correctly answers the question.
|
||||||
|
Question: What is the capital of France?
|
||||||
|
Reference Answer: Paris
|
||||||
|
Model Response: Paris
|
||||||
|
Is the model response correct? If correct, answer "A"; if incorrect, answer "B".
|
||||||
|
""",
|
||||||
|
'gold': 'Paris',
|
||||||
|
'prediction': 'A',
|
||||||
|
},
|
||||||
|
# ... more results
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## CascadeEvaluator
|
||||||
|
|
||||||
|
OpenCompass also provides a CascadeEvaluator that combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes:
|
||||||
|
|
||||||
|
1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, then only sends samples that were deemed incorrect by the rule-based evaluation to an LLM judge for re-evaluation. This approach reduces reliance on LLM judgments while maintaining accuracy, thus lowering evaluation costs and time.
|
||||||
|
|
||||||
|
2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and LLM judge, then considers a sample correct if either method marks it as correct. This approach can increase the leniency of evaluation but may result in higher costs since all samples require LLM evaluation.
|
||||||
|
|
||||||
|
### Configuring CascadeEvaluator
|
||||||
|
|
||||||
|
Here's an example of how to configure the CascadeEvaluator:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Define a rule-based evaluator
|
||||||
|
rule_evaluator = dict(type=MATHEvaluator)
|
||||||
|
|
||||||
|
# Define an LLM judge evaluator
|
||||||
|
llm_judge_evaluator = dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=YourDataset,
|
||||||
|
path='path/to/your/dataset',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=dict(), # Can use environment variables to configure the judge model
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configure cascade evaluator (cascade mode)
|
||||||
|
cascade_evaluator = dict(
|
||||||
|
type=CascadeEvaluator,
|
||||||
|
llm_evaluator=llm_judge_evaluator,
|
||||||
|
rule_evaluator=rule_evaluator,
|
||||||
|
parallel=False # Cascade mode
|
||||||
|
)
|
||||||
|
|
||||||
|
# For parallel mode, set parallel=True
|
||||||
|
parallel_evaluator = dict(
|
||||||
|
type=CascadeEvaluator,
|
||||||
|
llm_evaluator=llm_judge_evaluator,
|
||||||
|
rule_evaluator=rule_evaluator,
|
||||||
|
parallel=True # Parallel mode
|
||||||
|
)
|
||||||
|
|
||||||
|
# Use the cascade evaluator in your dataset evaluation config
|
||||||
|
eval_cfg = dict(evaluator=cascade_evaluator)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Evaluation Results
|
||||||
|
|
||||||
|
The cascade evaluator outputs detailed evaluation statistics including:
|
||||||
|
|
||||||
|
- Accuracy of the rule-based evaluation
|
||||||
|
- Accuracy of the LLM evaluation (for samples that failed rule-based evaluation in cascade mode)
|
||||||
|
- Final combined accuracy
|
||||||
|
|
||||||
|
Example output:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
'accuracy': 85.0, # Final accuracy
|
||||||
|
'cascade_stats': {
|
||||||
|
'total_samples': 100,
|
||||||
|
'rule_correct': 70, # Number of samples correct by rule evaluation
|
||||||
|
'rule_accuracy': 70.0, # Accuracy of rule evaluation
|
||||||
|
'llm_evaluated': 30, # Number of samples evaluated by LLM (failed samples in cascade mode)
|
||||||
|
'llm_correct': 15, # Number of samples correct by LLM evaluation
|
||||||
|
'llm_accuracy': 50.0, # Accuracy of LLM evaluation
|
||||||
|
'final_correct': 85, # Total correct samples
|
||||||
|
'final_accuracy': 85.0, # Final accuracy
|
||||||
|
'parallel_mode': False, # Whether parallel mode was used
|
||||||
|
},
|
||||||
|
'details': [
|
||||||
|
# Detailed evaluation results for each sample
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The cascade evaluator is particularly useful for:
|
||||||
|
|
||||||
|
1. Scenarios that require balancing evaluation cost and accuracy
|
||||||
|
2. Cases where rule-based evaluators are available but might not be comprehensive
|
||||||
|
3. Evaluation tasks that need more nuanced judgment for edge cases
|
||||||
|
|
||||||
|
## Complete Example
|
||||||
|
|
||||||
|
For a complete working example using GenericLLMEvaluator
|
||||||
|
, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving.
|
||||||
|
|
||||||
|
For a complete working example using CascadeEvaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving.
|
190
docs/en/advanced_guides/math_verify.md
Normal file
190
docs/en/advanced_guides/math_verify.md
Normal file
@ -0,0 +1,190 @@
|
|||||||
|
# General Math Evaluation Guidance
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHEvaluator components.
|
||||||
|
|
||||||
|
## Dataset Format
|
||||||
|
|
||||||
|
The math evaluation dataset should be in either JSON Lines (.jsonl) or CSV format. Each problem should contain at least:
|
||||||
|
|
||||||
|
- A problem statement
|
||||||
|
- A solution/answer (typically in LaTeX format with the final answer in \\boxed{})
|
||||||
|
|
||||||
|
Example JSONL format:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"problem": "Find the value of x if 2x + 3 = 7", "solution": "Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Example CSV format:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
problem,solution
|
||||||
|
"Find the value of x if 2x + 3 = 7","Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
To evaluate mathematical reasoning, you'll need to set up three main components:
|
||||||
|
|
||||||
|
1. Dataset Reader Configuration
|
||||||
|
|
||||||
|
```python
|
||||||
|
math_reader_cfg = dict(
|
||||||
|
input_columns=['problem'], # Column name for the question
|
||||||
|
output_column='solution' # Column name for the answer
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Inference Configuration
|
||||||
|
|
||||||
|
```python
|
||||||
|
math_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Evaluation Configuration
|
||||||
|
|
||||||
|
```python
|
||||||
|
math_eval_cfg = dict(
|
||||||
|
evaluator=dict(type=MATHEvaluator),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using CustomDataset
|
||||||
|
|
||||||
|
Here's how to set up a complete configuration for math evaluation:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
from opencompass.datasets import CustomDataset
|
||||||
|
|
||||||
|
math_datasets = [
|
||||||
|
dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
abbr='my-math-dataset', # Dataset abbreviation
|
||||||
|
path='path/to/your/dataset', # Path to your dataset file
|
||||||
|
reader_cfg=math_reader_cfg,
|
||||||
|
infer_cfg=math_infer_cfg,
|
||||||
|
eval_cfg=math_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
## MATHEvaluator
|
||||||
|
|
||||||
|
The MATHEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions.
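For background, the underlying `math_verify` library can also be used on its own. The snippet below is a minimal sketch and assumes the `parse`/`verify` entry points of that package; check your installed version if they differ:

```python
from math_verify import parse, verify

# Parse a gold answer and a model answer, then check mathematical equivalence.
gold = parse("$\\boxed{\\frac{1}{2}}$")
pred = parse("$0.5$")
print(verify(gold, pred))  # expected: True, since 1/2 == 0.5
```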
|
||||||
|
|
||||||
|
The MATHEvaluator implements:
|
||||||
|
|
||||||
|
1. Extracts answers from both predictions and references using LaTeX extraction
|
||||||
|
2. Handles various LaTeX formats and environments
|
||||||
|
3. Verifies mathematical equivalence between predicted and reference answers
|
||||||
|
4. Provides detailed evaluation results including:
|
||||||
|
- Accuracy score
|
||||||
|
- Detailed comparison between predictions and references
|
||||||
|
- Parse results of both predicted and reference answers
|
||||||
|
|
||||||
|
The evaluator supports:
|
||||||
|
|
||||||
|
- Basic arithmetic operations
|
||||||
|
- Fractions and decimals
|
||||||
|
- Algebraic expressions
|
||||||
|
- Trigonometric functions
|
||||||
|
- Roots and exponents
|
||||||
|
- Mathematical symbols and operators
|
||||||
|
|
||||||
|
Example evaluation output:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
'accuracy': 85.0, # Percentage of correct answers
|
||||||
|
'details': [
|
||||||
|
{
|
||||||
|
'predictions': 'x = 2', # Parsed prediction
|
||||||
|
'references': 'x = 2', # Parsed reference
|
||||||
|
'correct': True # Whether they match
|
||||||
|
},
|
||||||
|
# ... more results
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Complete Example
|
||||||
|
|
||||||
|
Here's a complete example of how to set up math evaluation:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
from opencompass.datasets import CustomDataset
|
||||||
|
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
|
||||||
|
# Dataset reader configuration
|
||||||
|
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
|
||||||
|
|
||||||
|
# Inference configuration
|
||||||
|
math_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Evaluation configuration
|
||||||
|
math_eval_cfg = dict(
|
||||||
|
evaluator=dict(type=MATHEvaluator),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Dataset configuration
|
||||||
|
math_datasets = [
|
||||||
|
dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
abbr='my-math-dataset',
|
||||||
|
path='path/to/your/dataset.jsonl', # or .csv
|
||||||
|
reader_cfg=math_reader_cfg,
|
||||||
|
infer_cfg=math_infer_cfg,
|
||||||
|
eval_cfg=math_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# Model configuration
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='your-model-name',
|
||||||
|
path='your/model/path',
|
||||||
|
# ... other model configurations
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# Output directory
|
||||||
|
work_dir = './outputs/math_eval'
|
||||||
|
```
|
@ -90,4 +90,16 @@ Although OpenCompass has already included most commonly used datasets, users nee
|
|||||||
return dataset
|
return dataset
|
||||||
```
|
```
|
||||||
|
|
||||||
|
3. After completing the dataset script and config file, you need to register the information of your new dataset in the `dataset-index.yml` file in the repository root directory, so that it can be added to the dataset statistics list on the OpenCompass website.
|
||||||
|
|
||||||
|
- The keys that need to be filled in include `name`: the name of your dataset, `category`: the category of your dataset, `paper`: the URL of the paper or project, and `configpath`: the path to the dataset config file. Here's an example:
|
||||||
|
|
||||||
|
```
|
||||||
|
- mydataset:
|
||||||
|
name: MyDataset
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/xxxxxxx
|
||||||
|
configpath: opencompass/configs/datasets/MyDataset
|
||||||
|
```
|
||||||
|
|
||||||
Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial.
|
Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial.
|
||||||
|
65
docs/en/advanced_guides/persistence.md
Normal file
65
docs/en/advanced_guides/persistence.md
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
# Evaluation Results Persistence
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
Normally, the evaluation results of OpenCompass will be saved to your work directory. But in some cases, there may be a need for data sharing among users or quickly browsing existing public evaluation results. Therefore, we provide an interface that can quickly transfer evaluation results to external public data stations, and on this basis, provide functions such as uploading, overwriting, and reading.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Uploading
|
||||||
|
|
||||||
|
By adding arguments to the evaluation command or adding configuration to the eval script, the evaluation results can be stored in the path you specify. Here are examples:
|
||||||
|
|
||||||
|
(Approach 1) Add the `-sp` option to the command and specify your public path.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass ... -sp '/your_path'
|
||||||
|
```
|
||||||
|
|
||||||
|
(Approach 2) Add configuration in the Eval script.
|
||||||
|
|
||||||
|
```python
|
||||||
|
station_path = '/your_path'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Overwriting
|
||||||
|
|
||||||
|
Before uploading, the above storage method first checks, based on the `abbr` attribute of the model and dataset configurations, whether the same task result already exists in the data station. If results already exist, the upload is skipped. If you need to update these results, add the `--station-overwrite` option to the command, as in the following example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass ... -sp '/your_path' --station-overwrite
|
||||||
|
```
|
||||||
|
|
||||||
|
### Reading
|
||||||
|
|
||||||
|
You can directly read existing results from the data station to avoid duplicate evaluation tasks. The retrieved results will directly participate in the `summarize` step. With this configuration, only tasks whose results are not yet stored in the data station will be launched. Here is an example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass ... -sp '/your_path' --read-from-station
|
||||||
|
```
|
||||||
|
|
||||||
|
### Command Combination
|
||||||
|
|
||||||
|
1. Only upload the results under your latest working directory to the data station, without re-running tasks whose results are missing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass ... -sp '/your_path' -r latest -m viz
|
||||||
|
```
|
||||||
|
|
||||||
|
## Storage Format of the Data Station
|
||||||
|
|
||||||
|
In the data station, the evaluation results are stored as `json` files for each `model-dataset` pair. The specific directory layout is `/your_path/dataset_name/model_name.json`. Each `json` file stores a dictionary with the results, including `predictions`, `results`, and `cfg`. Here is an example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
Result = {
|
||||||
|
'predictions': List[Dict],
|
||||||
|
'results': Dict,
|
||||||
|
'cfg': Dict = {
|
||||||
|
'models': Dict,
|
||||||
|
'datasets': Dict,
|
||||||
|
(Only subjective datasets)'judge_models': Dict
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Among these three keys, `predictions` records the model's prediction for each item in the dataset, `results` records the model's overall score on the dataset, and `cfg` records the detailed configurations of the model and the dataset in this evaluation task. A short sketch of reading one stored result back is given below.
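As an illustration of this layout, one stored result can be read back as follows (the path components below are placeholders):

```python
import json
from pathlib import Path

# Placeholder path following the /your_path/dataset_name/model_name.json layout
result_path = Path('/your_path') / 'my_dataset' / 'my_model.json'
result = json.loads(result_path.read_text())

print(result['results'])           # overall scores of the model on this dataset
print(len(result['predictions']))  # number of per-sample prediction records
print(result['cfg']['models'])     # model configuration used in this run
```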
|
@ -117,6 +117,10 @@ html_js_files = [
|
|||||||
'js/custom.js'
|
'js/custom.js'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
html_context = {
|
||||||
|
'github_version': 'main',
|
||||||
|
}
|
||||||
|
|
||||||
# -- Options for HTMLHelp output ---------------------------------------------
|
# -- Options for HTMLHelp output ---------------------------------------------
|
||||||
|
|
||||||
# Output file base name for HTML help builder.
|
# Output file base name for HTML help builder.
|
||||||
@ -220,3 +224,11 @@ autodoc_typehints = 'none'
|
|||||||
|
|
||||||
# The not found page
|
# The not found page
|
||||||
notfound_template = '404.html'
|
notfound_template = '404.html'
|
||||||
|
|
||||||
|
|
||||||
|
def builder_inited_handler(app):
|
||||||
|
subprocess.run(['./statis.py'])
|
||||||
|
|
||||||
|
|
||||||
|
def setup(app):
|
||||||
|
app.connect('builder-inited', builder_inited_handler)
|
@ -39,8 +39,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
|
|||||||
user_guides/evaluation.md
|
user_guides/evaluation.md
|
||||||
user_guides/experimentation.md
|
user_guides/experimentation.md
|
||||||
user_guides/metrics.md
|
user_guides/metrics.md
|
||||||
user_guides/summarizer.md
|
user_guides/deepseek_r1.md
|
||||||
user_guides/corebench.md
|
|
||||||
|
|
||||||
.. _Prompt:
|
.. _Prompt:
|
||||||
.. toctree::
|
.. toctree::
|
||||||
@ -62,16 +61,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
|
|||||||
advanced_guides/custom_dataset.md
|
advanced_guides/custom_dataset.md
|
||||||
advanced_guides/new_model.md
|
advanced_guides/new_model.md
|
||||||
advanced_guides/evaluation_lmdeploy.md
|
advanced_guides/evaluation_lmdeploy.md
|
||||||
advanced_guides/evaluation_lightllm.md
|
|
||||||
advanced_guides/accelerator_intro.md
|
advanced_guides/accelerator_intro.md
|
||||||
|
advanced_guides/math_verify.md
|
||||||
|
advanced_guides/llm_judge.md
|
||||||
advanced_guides/code_eval.md
|
advanced_guides/code_eval.md
|
||||||
advanced_guides/code_eval_service.md
|
advanced_guides/code_eval_service.md
|
||||||
advanced_guides/prompt_attack.md
|
|
||||||
advanced_guides/longeval.md
|
|
||||||
advanced_guides/subjective_evaluation.md
|
advanced_guides/subjective_evaluation.md
|
||||||
advanced_guides/circular_eval.md
|
advanced_guides/persistence.md
|
||||||
advanced_guides/contamination_eval.md
|
|
||||||
advanced_guides/needleinahaystack_eval.md
|
|
||||||
|
|
||||||
.. _Tools:
|
.. _Tools:
|
||||||
.. toctree::
|
.. toctree::
|
||||||
@ -80,6 +76,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
|
|||||||
|
|
||||||
tools.md
|
tools.md
|
||||||
|
|
||||||
|
.. _Dataset List:
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: Dataset List
|
||||||
|
|
||||||
|
dataset_statistics.md
|
||||||
|
|
||||||
.. _Notes:
|
.. _Notes:
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
103
docs/en/statis.py
Executable file
103
docs/en/statis.py
Executable file
@ -0,0 +1,103 @@
|
|||||||
|
#! /usr/bin/env python
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
OC_ROOT = Path(__file__).absolute().parents[2]
|
||||||
|
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
|
||||||
|
DATASETZOO_TEMPLATE = """\
|
||||||
|
# Dataset Statistics
|
||||||
|
|
||||||
|
On this page, we have listed all the datasets supported by OpenCompass.
|
||||||
|
|
||||||
|
You can use sorting and search functions to find the dataset you need.
|
||||||
|
|
||||||
|
We provide recommended running configurations for each dataset,
|
||||||
|
and for some datasets we also offer recommended configurations based on LLM Judge.
|
||||||
|
|
||||||
|
You can quickly start evaluation tasks based on the recommended configurations.
|
||||||
|
However, please note that these configurations may be updated over time.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
with open('dataset_statistics.md', 'w') as f:
|
||||||
|
f.write(DATASETZOO_TEMPLATE)
|
||||||
|
|
||||||
|
load_path = str(OC_ROOT / 'dataset-index.yml')
|
||||||
|
|
||||||
|
with open(load_path, 'r') as f2:
|
||||||
|
data_list = yaml.load(f2, Loader=yaml.FullLoader)
|
||||||
|
|
||||||
|
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
|
||||||
|
|
||||||
|
recommanded_dataset_list = [
|
||||||
|
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
|
||||||
|
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
|
||||||
|
'mmlu_pro', 'musr', 'math500'
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def table_format(data_list):
|
||||||
|
table_format_list = []
|
||||||
|
for i in data_list:
|
||||||
|
table_format_list_sub = []
|
||||||
|
for j in i:
|
||||||
|
if j in recommanded_dataset_list:
|
||||||
|
link_token = '[link]('
|
||||||
|
else:
|
||||||
|
link_token = '[link(TBD)]('
|
||||||
|
|
||||||
|
for index in HEADER:
|
||||||
|
if index == 'paper':
|
||||||
|
table_format_list_sub.append('[link](' + i[j][index] + ')')
|
||||||
|
elif index == 'configpath_llmjudge':
|
||||||
|
if i[j][index] == '':
|
||||||
|
table_format_list_sub.append(i[j][index])
|
||||||
|
else:
|
||||||
|
table_format_list_sub.append(link_token +
|
||||||
|
GITHUB_PREFIX +
|
||||||
|
i[j][index] + ')')
|
||||||
|
elif index == 'configpath':
|
||||||
|
if isinstance(i[j][index], list):
|
||||||
|
sub_list_text = ''
|
||||||
|
for k in i[j][index]:
|
||||||
|
sub_list_text += (link_token + GITHUB_PREFIX + k +
|
||||||
|
') / ')
|
||||||
|
table_format_list_sub.append(sub_list_text[:-2])
|
||||||
|
else:
|
||||||
|
table_format_list_sub.append(link_token +
|
||||||
|
GITHUB_PREFIX +
|
||||||
|
i[j][index] + ')')
|
||||||
|
else:
|
||||||
|
table_format_list_sub.append(i[j][index])
|
||||||
|
table_format_list.append(table_format_list_sub)
|
||||||
|
return table_format_list
|
||||||
|
|
||||||
|
|
||||||
|
data_format_list = table_format(data_list)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_table(data_list, title=None):
|
||||||
|
|
||||||
|
with open('dataset_statistics.md', 'a') as f:
|
||||||
|
if title is not None:
|
||||||
|
f.write(f'\n{title}')
|
||||||
|
f.write("""\n```{table}\n:class: dataset\n""")
|
||||||
|
header = [
|
||||||
|
'Name', 'Category', 'Paper or Repository', 'Recommended Config',
|
||||||
|
'Recommended Config (LLM Judge)'
|
||||||
|
]
|
||||||
|
table_cfg = dict(tablefmt='pipe',
|
||||||
|
floatfmt='.2f',
|
||||||
|
numalign='right',
|
||||||
|
stralign='center')
|
||||||
|
f.write(tabulate(data_list, header, **table_cfg))
|
||||||
|
f.write('\n```\n')
|
||||||
|
|
||||||
|
|
||||||
|
generate_table(
|
||||||
|
data_list=data_format_list,
|
||||||
|
title='## Supported Dataset List',
|
||||||
|
)
|
@ -81,3 +81,43 @@ datasets += cmnli_datasets
|
|||||||
Users can choose different abilities, different datasets and different evaluation methods configuration files to build the part of the dataset in the evaluation script according to their needs.
|
Users can choose different abilities, different datasets and different evaluation methods configuration files to build the part of the dataset in the evaluation script according to their needs.
|
||||||
|
|
||||||
For information on how to start an evaluation task and how to evaluate self-built datasets, please refer to the relevant documents.
|
For information on how to start an evaluation task and how to evaluate self-built datasets, please refer to the relevant documents.
|
||||||
|
|
||||||
|
### Multiple Evaluations on the Dataset
|
||||||
|
|
||||||
|
In the dataset configuration, you can set the parameter `n` to perform multiple evaluations on the same dataset and return the average metrics, for example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
afqmc_datasets = [
|
||||||
|
dict(
|
||||||
|
abbr="afqmc-dev",
|
||||||
|
type=AFQMCDatasetV2,
|
||||||
|
path="./data/CLUE/AFQMC/dev.json",
|
||||||
|
n=10, # Perform 10 evaluations
|
||||||
|
reader_cfg=afqmc_reader_cfg,
|
||||||
|
infer_cfg=afqmc_infer_cfg,
|
||||||
|
eval_cfg=afqmc_eval_cfg,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
Additionally, for binary evaluation metrics (such as accuracy, pass-rate, etc.), you can also set the parameter `k` in conjunction with `n` for [G-Pass@k](http://arxiv.org/abs/2412.13147) evaluation. The formula for G-Pass@k is:
|
||||||
|
|
||||||
|
```{math}
|
||||||
|
\text{G-Pass@}k_\tau=E_{\text{Data}}\left[ \sum_{j=\lceil \tau \cdot k \rceil}^c \frac{{c \choose j} \cdot {n - c \choose k - j}}{{n \choose k}} \right],
|
||||||
|
```
|
||||||
|
|
||||||
|
where $n$ is the number of evaluations and $c$ is the number of the $n$ runs that passed or were judged correct. An example configuration is as follows; a small standalone sketch of the formula itself is given after the example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
aime2024_datasets = [
|
||||||
|
dict(
|
||||||
|
abbr='aime2024',
|
||||||
|
type=Aime2024Dataset,
|
||||||
|
path='opencompass/aime2024',
|
||||||
|
k=[2, 4], # Return results for G-Pass@2 and G-Pass@4
|
||||||
|
n=12, # 12 evaluations
|
||||||
|
...
|
||||||
|
)
|
||||||
|
]
|
||||||
|
```
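For reference, here is a small standalone sketch of the G-Pass@k formula above for a single sample (illustrative only, not OpenCompass internals):

```python
from math import ceil, comb

def g_pass_at_k(n: int, c: int, k: int, tau: float) -> float:
    """G-Pass@k_tau for one sample: probability that at least ceil(tau * k) of k
    runs drawn without replacement from n runs are correct, given c correct runs."""
    j_min = ceil(tau * k)
    total = comb(n, k)
    return sum(comb(c, j) * comb(n - c, k - j) for j in range(j_min, min(c, k) + 1)) / total

# Example: 12 runs, 8 of them correct, G-Pass@4 with tau=1.0 (all 4 sampled runs correct)
print(round(g_pass_at_k(n=12, c=8, k=4, tau=1.0), 4))
```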
|
||||||
|
192
docs/en/user_guides/deepseek_r1.md
Normal file
192
docs/en/user_guides/deepseek_r1.md
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
# Tutorial for Evaluating Reasoning Models
|
||||||
|
|
||||||
|
OpenCompass provides an evaluation tutorial for DeepSeek R1 series reasoning models (mathematical datasets).
|
||||||
|
|
||||||
|
- At the model level, we recommend using the sampling approach to reduce repetitions caused by greedy decoding
|
||||||
|
- For datasets with limited samples, we employ multiple evaluation runs and take the average
|
||||||
|
- For answer validation, we utilize LLM-based verification to reduce misjudgments from rule-based evaluation
|
||||||
|
|
||||||
|
## Installation and Preparation
|
||||||
|
|
||||||
|
Please follow OpenCompass's installation guide.
|
||||||
|
|
||||||
|
## Evaluation Configuration Setup
|
||||||
|
|
||||||
|
We provide example configurations in `examples/eval_deepseek_r1.py`. Below is the configuration explanation:
|
||||||
|
|
||||||
|
### Configuration Interpretation
|
||||||
|
|
||||||
|
#### 1. Dataset and Validator Configuration
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Configuration supporting multiple runs (example)
|
||||||
|
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets
|
||||||
|
|
||||||
|
datasets = sum(
|
||||||
|
(v for k, v in locals().items() if k.endswith('_datasets')),
|
||||||
|
[],
|
||||||
|
)
|
||||||
|
|
||||||
|
# LLM validator configuration. Users need to deploy API services via LMDeploy/vLLM/SGLang or use OpenAI-compatible endpoints
|
||||||
|
verifier_cfg = dict(
|
||||||
|
abbr='qwen2-5-32B-Instruct',
|
||||||
|
type=OpenAISDK,
|
||||||
|
path='Qwen/Qwen2.5-32B-Instruct', # Replace with actual path
|
||||||
|
key='YOUR_API_KEY', # Use real API key
|
||||||
|
openai_api_base=['http://your-api-endpoint'], # Replace with API endpoint
|
||||||
|
query_per_second=16,
|
||||||
|
batch_size=1024,
|
||||||
|
temperature=0.001,
|
||||||
|
max_out_len=16384
|
||||||
|
)
|
||||||
|
|
||||||
|
# Apply validator to all datasets
|
||||||
|
for item in datasets:
|
||||||
|
if 'judge_cfg' in item['eval_cfg']['evaluator']:
|
||||||
|
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Model Configuration
|
||||||
|
|
||||||
|
We provide an example that uses LMDeploy as the inference backend for the reasoning model; users can modify `path` (i.e., the HF model path) as needed.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# LMDeploy model configuration example
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='deepseek-r1-distill-qwen-7b-turbomind',
|
||||||
|
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
|
||||||
|
gen_config=dict(
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.95,
|
||||||
|
max_new_tokens=32768
|
||||||
|
),
|
||||||
|
max_seq_len=32768,
|
||||||
|
batch_size=64,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||||
|
),
|
||||||
|
# Extendable 14B/32B configurations...
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Evaluation Process Configuration
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Inference configuration
|
||||||
|
infer = dict(
|
||||||
|
partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
|
||||||
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
|
||||||
|
|
||||||
|
# Evaluation configuration
|
||||||
|
eval = dict(
|
||||||
|
partitioner=dict(type=NaivePartitioner, n=8),
|
||||||
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)))
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 4. Summary Configuration
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Multiple runs results average configuration
|
||||||
|
summary_groups = [
|
||||||
|
{
|
||||||
|
'name': 'AIME2024-Aveage8',
|
||||||
|
'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
|
||||||
|
},
|
||||||
|
# Other dataset average configurations...
|
||||||
|
]
|
||||||
|
|
||||||
|
summarizer = dict(
|
||||||
|
dataset_abbrs=[
|
||||||
|
['AIME2024-Aveage8', 'naive_average'],
|
||||||
|
# Other dataset metrics...
|
||||||
|
],
|
||||||
|
summary_groups=summary_groups
|
||||||
|
)
|
||||||
|
|
||||||
|
# Work directory configuration
|
||||||
|
work_dir = "outputs/deepseek_r1_reasoning"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Evaluation Execution
|
||||||
|
|
||||||
|
### Scenario 1: Model loaded on 1 GPU, data evaluated by 1 worker, using a total of 1 GPU
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass examples/eval_deepseek_r1.py --debug --dump-eval-details
|
||||||
|
```
|
||||||
|
|
||||||
|
Evaluation logs will be output in the command line.
|
||||||
|
|
||||||
|
### Scenario 2: Model loaded on 1 GPU, data evaluated by 8 workers, using a total of 8 GPUs
|
||||||
|
|
||||||
|
You need to modify the `infer` configuration in the configuration file and set `num_worker` to 8:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Inference configuration
|
||||||
|
infer = dict(
|
||||||
|
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
||||||
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
|
||||||
|
```
|
||||||
|
|
||||||
|
At the same time, remove the `--debug` parameter from the evaluation command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass examples/eval_deepseek_r1.py --dump-eval-details
|
||||||
|
```
|
||||||
|
|
||||||
|
In this mode, OpenCompass will use multithreading to start `$num_worker` tasks. Logs will not be displayed in the command line; instead, detailed evaluation logs will be written under `$work_dir`.
|
||||||
|
|
||||||
|
### Scenario 3: Model loaded on 2 GPUs, data evaluated by 4 workers, using a total of 8 GPUs
|
||||||
|
|
||||||
|
Note that in the model configuration, `num_gpus` in `run_cfg` needs to be set to 2 (if an inference backend is used, its parallelism parameters, such as `tp` in LMDeploy, also need to be changed to 2 accordingly), and `num_worker` in the `infer` configuration needs to be set to 4:
|
||||||
|
|
||||||
|
```python
|
||||||
|
models += [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='deepseek-r1-distill-qwen-14b-turbomind',
|
||||||
|
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
|
||||||
|
gen_config=dict(
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.95,
|
||||||
|
max_new_tokens=32768),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=32768,
|
||||||
|
batch_size=128,
|
||||||
|
run_cfg=dict(num_gpus=2),
|
||||||
|
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||||
|
),
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Inference configuration
|
||||||
|
infer = dict(
|
||||||
|
partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
|
||||||
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Evaluation Results
|
||||||
|
|
||||||
|
The evaluation results are displayed as follows:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dataset           version  metric         mode  deepseek-r1-distill-qwen-7b-turbomind
----------------  -------  -------------  ----  -------------------------------------
MATH              -        -              -
AIME2024-Aveage8  -        naive_average  gen   56.25
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Baseline
|
||||||
|
|
||||||
|
Since the model uses sampling for decoding and the AIME dataset is small, there may still be a performance fluctuation of 1-3 points even when averaging over 8 evaluations.
|
||||||
|
|
||||||
|
| Model | Dataset | Metric | Value |
|
||||||
|
| ---------------------------- | -------- | -------- | ----- |
|
||||||
|
| DeepSeek-R1-Distill-Qwen-7B | AIME2024 | Accuracy | 56.3 |
|
||||||
|
| DeepSeek-R1-Distill-Qwen-14B | AIME2024 | Accuracy | 74.2 |
|
||||||
|
| DeepSeek-R1-Distill-Qwen-32B | AIME2024 | Accuracy | 74.2 |
|
@ -57,7 +57,7 @@ The parameter explanation is as follows:
|
|||||||
- `-w`: Specify the working path, default is `./outputs/default`.
|
- `-w`: Specify the working path, default is `./outputs/default`.
|
||||||
- `-l`: Enable status reporting via Lark bot.
|
- `-l`: Enable status reporting via Lark bot.
|
||||||
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
|
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
|
||||||
- `--dump-eval-details`: When enabled,evaluation under the `results` folder will include more details, such as the correctness of each sample.
|
- `--dump-eval-details`: Enabled by default. Evaluation results under the `results` folder will include more details, such as the correctness of each sample. Set `--dump-eval-details False` to disable it.
|
||||||
|
|
||||||
Using run mode `-m all` as an example, the overall execution flow is as follows:
|
Using run mode `-m all` as an example, the overall execution flow is as follows:
|
||||||
|
|
||||||
|
@ -1,10 +1,20 @@
|
|||||||
var collapsedSections = [];
|
var collapsedSections = ['数据集统计'];
|
||||||
|
|
||||||
$(document).ready(function () {
|
$(document).ready(function () {
|
||||||
$('.model-summary').DataTable({
|
$('.dataset').DataTable({
|
||||||
"stateSave": false,
|
"stateSave": false,
|
||||||
"lengthChange": false,
|
"lengthChange": false,
|
||||||
"pageLength": 20,
|
"pageLength": 20,
|
||||||
"order": []
|
"order": [],
|
||||||
|
"language": {
|
||||||
|
"info": "显示 _START_ 至 _END_ 条目(总计 _TOTAL_ )",
|
||||||
|
"infoFiltered": "(筛选自 _MAX_ 条目)",
|
||||||
|
"search": "搜索:",
|
||||||
|
"zeroRecords": "没有找到任何条目",
|
||||||
|
"paginate": {
|
||||||
|
"next": "下一页",
|
||||||
|
"previous": "上一页"
|
||||||
|
},
|
||||||
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
368
docs/zh_cn/advanced_guides/llm_judge.md
Normal file
368
docs/zh_cn/advanced_guides/llm_judge.md
Normal file
@ -0,0 +1,368 @@
|
|||||||
|
# LLM 作为评判器
|
||||||
|
|
||||||
|
## 简介
|
||||||
|
|
||||||
|
GenericLLMEvaluator组件特别适用于那些难以通过规则式方法(如正则表达式)进行完美判断的场景,例如:
|
||||||
|
|
||||||
|
- 模型不输出选项标识而只输出选项内容的情况
|
||||||
|
- 需要事实性判断的数据集
|
||||||
|
- 需要复杂理解和推理的开放式回答
|
||||||
|
- 需要设计大量规则的判断
|
||||||
|
|
||||||
|
OpenCompass提供了GenericLLMEvaluator组件来实现LLM作为评判器的评估。
|
||||||
|
|
||||||
|
## 数据集格式
|
||||||
|
|
||||||
|
用于LLM评判的数据集应该是JSON Lines (.jsonl)或CSV格式。每个条目至少应包含:
|
||||||
|
|
||||||
|
- 问题或任务
|
||||||
|
- 参考答案或标准答案
|
||||||
|
- (模型的预测将在评估过程中生成)
|
||||||
|
|
||||||
|
JSONL格式示例:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"problem": "法国的首都是什么?", "answer": "巴黎"}
|
||||||
|
```
|
||||||
|
|
||||||
|
CSV格式示例:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
problem,answer
|
||||||
|
"法国的首都是什么?","巴黎"
|
||||||
|
```
|
||||||
|
|
||||||
|
## 配置说明
|
||||||
|
|
||||||
|
### 基于命令行使用LLM进行评估
|
||||||
|
|
||||||
|
OpenCompass中部分数据集已经包含了LLM评判器的配置。
|
||||||
|
你需要使用一个模型服务(如OpenAI或DeepSeek官方提供的API)或本地使用LMDeploy、vLLM、SGLang等工具启动一个模型服务。
|
||||||
|
|
||||||
|
然后,你可以通过以下命令设置相关评估服务的环境变量,并对模型进行评估:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
|
||||||
|
export OC_JUDGE_API_KEY=sk-1234
|
||||||
|
export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
|
||||||
|
```
|
||||||
|
|
||||||
|
注意,默认情况下,OpenCompass会使用这三个环境变量,但如果你使用了基于配置文件的方式配置评估服务,这三个环境变量将不会生效。
|
||||||
|
|
||||||
|
### 基于配置文件使用LLM进行评估
|
||||||
|
|
||||||
|
对一个数据集设置LLM评判评估,你需要配置三个主要组件:
|
||||||
|
|
||||||
|
1. 数据集读取配置
|
||||||
|
|
||||||
|
```python
|
||||||
|
reader_cfg = dict(
|
||||||
|
input_columns=['problem'], # 问题列的名称
|
||||||
|
output_column='answer' # 参考答案列的名称
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. 推理配置
|
||||||
|
|
||||||
|
```python
|
||||||
|
infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}', # 提示模型的模板
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
3. 使用LLM评判器的评估配置
|
||||||
|
|
||||||
|
```python
|
||||||
|
eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator, # 使用LLM作为评估器
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="你是一个负责评估模型输出正确性和质量的助手。",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), # 评判器的模板
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
path='path/to/your/dataset',
|
||||||
|
file_name='your_dataset.jsonl',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=YOUR_JUDGE_MODEL_CONFIG, # 评判模型的配置
|
||||||
|
dict_postprocessor=dict(type=generic_llmjudge_postprocess), # 处理评判器输出的后处理器
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 使用CustomDataset和GenericLLMEvaluator
|
||||||
|
|
||||||
|
以下是如何设置完整的LLM评判评估配置:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
from opencompass.datasets import CustomDataset
|
||||||
|
from opencompass.evaluator import GenericLLMEvaluator
|
||||||
|
from opencompass.datasets import generic_llmjudge_postprocess
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
|
||||||
|
# 导入评判模型配置
|
||||||
|
with read_base():
|
||||||
|
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
|
||||||
|
models as judge_model,
|
||||||
|
)
|
||||||
|
|
||||||
|
# 定义评判模板
|
||||||
|
JUDGE_TEMPLATE = """
|
||||||
|
请评估以下回答是否正确地回答了问题。
|
||||||
|
问题:{problem}
|
||||||
|
参考答案:{answer}
|
||||||
|
模型回答:{prediction}
|
||||||
|
|
||||||
|
模型回答是否正确?如果正确,请回答"A";如果不正确,请回答"B"。
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
# 数据集读取配置
|
||||||
|
reader_cfg = dict(input_columns=['problem'], output_column='answer')
|
||||||
|
|
||||||
|
# 被评估模型的推理配置
|
||||||
|
infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}',
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
# 使用LLM评判器的评估配置
|
||||||
|
eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="你是一个负责评估模型输出正确性和质量的助手。",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt=JUDGE_TEMPLATE),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
path='path/to/your/dataset',
|
||||||
|
file_name='your_dataset.jsonl',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=judge_model[0],
|
||||||
|
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||||
|
),
|
||||||
|
pred_role='BOT',
|
||||||
|
)
|
||||||
|
|
||||||
|
# 数据集配置
|
||||||
|
datasets = [
|
||||||
|
dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
abbr='my-dataset',
|
||||||
|
path='path/to/your/dataset',
|
||||||
|
file_name='your_dataset.jsonl',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
infer_cfg=infer_cfg,
|
||||||
|
eval_cfg=eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# 被评估模型的配置
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='model-to-evaluate',
|
||||||
|
path='path/to/your/model',
|
||||||
|
# ... 其他模型配置
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# 输出目录
|
||||||
|
work_dir = './outputs/llm_judge_eval'
|
||||||
|
```
|
||||||
|
|
||||||
|
## GenericLLMEvaluator
|
||||||
|
|
||||||
|
GenericLLMEvaluator专为使用LLM作为评判器评估模型输出而设计。主要特点包括:
|
||||||
|
|
||||||
|
1. 灵活的提示模板,用于指导评判器
|
||||||
|
2. 支持各种评判模型(本地或基于API)
|
||||||
|
3. 通过提示工程自定义评估标准
|
||||||
|
4. 对评判器输出进行后处理以提取结构化评估
|
||||||
|
|
||||||
|
**重要说明**:目前通用版本的评判模板只支持输出"A"(正确)或"B"(不正确)的格式,不支持其他输出格式(如"正确"或"不正确")。这是因为后处理函数`generic_llmjudge_postprocess`专门设计为解析这种格式。
|
||||||
|
|
||||||
|
评估器的工作原理:
|
||||||
|
|
||||||
|
1. 获取原始问题、参考答案和模型预测
|
||||||
|
2. 将它们格式化为评判模型的提示
|
||||||
|
3. 解析评判器的响应以确定评估结果(寻找"A"或"B")
|
||||||
|
4. 汇总整个数据集的结果
|
||||||
|
|
||||||
|
如果需要查看评估的详细结果,可以在启动任务时添加`--dump-eval-details`到命令行。
|
||||||
|
评估输出示例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
'accuracy': 75.0, # 被判断为正确的回答百分比
|
||||||
|
'details': [
|
||||||
|
{
|
||||||
|
'origin_prompt': """
|
||||||
|
请评估以下回答是否正确地回答了问题。
|
||||||
|
问题:法国的首都是什么?
|
||||||
|
参考答案:巴黎
|
||||||
|
模型回答:法国的首都是巴黎。
|
||||||
|
模型回答是否正确?如果正确,请回答"A";如果不正确,请回答"B"。""",
|
||||||
|
'gold': '巴黎',
|
||||||
|
'prediction': 'A',
|
||||||
|
},
|
||||||
|
# ... 更多结果
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 级联评估器 (CascadeEvaluator)
|
||||||
|
|
||||||
|
OpenCompass还提供了级联评估器`CascadeEvaluator`,它结合了规则式评估和LLM评估的优势。级联评估器有两种模式:
|
||||||
|
|
||||||
|
1. **级联模式(Cascade Mode, parallel=False)**:首先使用规则式评估器评估所有样本,然后只将规则式评估认为不正确的样本发送给LLM评判器进行重新评估。这种方式可以在保持准确性的同时减少对LLM评判的依赖,从而降低评估成本和时间。
|
||||||
|
|
||||||
|
2. **并行模式(Parallel Mode, parallel=True)**:使用规则式评估器和LLM评判器同时评估所有样本,如果任何一个评估器认为样本是正确的,则将该样本视为正确。这种方式可以提高评估的宽容度,但可能会导致更高的成本,因为所有样本都需要LLM评估。
|
||||||
|
|
||||||
|
### 配置CascadeEvaluator
|
||||||
|
|
||||||
|
以下是配置`CascadeEvaluator`的示例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 定义规则式评估器
|
||||||
|
rule_evaluator = dict(type=MATHEvaluator)
|
||||||
|
|
||||||
|
# 定义LLM评判器
|
||||||
|
llm_judge_evaluator = dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="你是一个负责评估模型输出正确性和质量的助手。",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=YourDataset,
|
||||||
|
path='path/to/your/dataset',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=dict(), # 可以使用环境变量配置评判模型
|
||||||
|
)
|
||||||
|
|
||||||
|
# 配置级联评估器(级联模式)
|
||||||
|
cascade_evaluator = dict(
|
||||||
|
type=CascadeEvaluator,
|
||||||
|
llm_evaluator=llm_judge_evaluator,
|
||||||
|
rule_evaluator=rule_evaluator,
|
||||||
|
parallel=False # 级联模式
|
||||||
|
)
|
||||||
|
|
||||||
|
# 如果需要并行模式,可以设置parallel=True
|
||||||
|
parallel_evaluator = dict(
|
||||||
|
type=CascadeEvaluator,
|
||||||
|
llm_evaluator=llm_judge_evaluator,
|
||||||
|
rule_evaluator=rule_evaluator,
|
||||||
|
parallel=True # 并行模式
|
||||||
|
)
|
||||||
|
|
||||||
|
# 在数据集评估配置中使用级联评估器
|
||||||
|
eval_cfg = dict(evaluator=cascade_evaluator)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 评估结果
|
||||||
|
|
||||||
|
级联评估器会输出详细的评估统计信息,包括:
|
||||||
|
|
||||||
|
- 规则评估的准确率
|
||||||
|
- LLM评估的准确率(针对规则评估失败的样本)
|
||||||
|
- 最终的综合准确率
|
||||||
|
|
||||||
|
输出示例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
'accuracy': 85.0, # 最终准确率
|
||||||
|
'cascade_stats': {
|
||||||
|
'total_samples': 100,
|
||||||
|
'rule_correct': 70, # 规则评估认为正确的样本数
|
||||||
|
'rule_accuracy': 70.0, # 规则评估的准确率
|
||||||
|
'llm_evaluated': 30, # LLM评估的样本数(级联模式下为规则评估失败的样本数)
|
||||||
|
'llm_correct': 15, # LLM评估认为正确的样本数
|
||||||
|
'llm_accuracy': 50.0, # LLM评估的准确率
|
||||||
|
'final_correct': 85, # 最终正确的样本数
|
||||||
|
'final_accuracy': 85.0, # 最终准确率
|
||||||
|
'parallel_mode': False, # 是否是并行模式
|
||||||
|
},
|
||||||
|
'details': [
|
||||||
|
# 每个样本的详细评估结果
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
级联评估器特别适用于:
|
||||||
|
|
||||||
|
1. 需要平衡评估成本和准确性的场景
|
||||||
|
2. 有可用的规则式评估器但可能不够完善的情况
|
||||||
|
3. 需要对边界情况进行更精确判断的评估任务
|
||||||
|
|
||||||
|
## 完整示例
|
||||||
|
|
||||||
|
如果希望了解通用LLM评判器,请参考examples目录中的`eval_llm_judge.py`文件,该示例展示了如何使用LLM评判器评估数学问题。
|
||||||
|
|
||||||
|
如果希望了解级联评估器,请参考examples目录中的`eval_cascade_evaluator.py`文件,该示例展示了如何使用级联评估器评估数学问题。
|
190
docs/zh_cn/advanced_guides/math_verify.md
Normal file
@ -0,0 +1,190 @@
|
|||||||
|
# 数学能力评测
|
||||||
|
|
||||||
|
## 简介
|
||||||
|
|
||||||
|
数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力,我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHEvaluator 组件提供了一种便捷的数学推理评测方式。
|
||||||
|
|
||||||
|
## 数据集格式
|
||||||
|
|
||||||
|
数学评测数据集应该是 JSON Lines (.jsonl) 或 CSV 格式。每个问题至少应包含:
|
||||||
|
|
||||||
|
- 问题陈述
|
||||||
|
- 解答/答案(通常使用 LaTeX 格式,最终答案需要用 \\boxed{} 括起来)
|
||||||
|
|
||||||
|
JSONL 格式示例:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"problem": "求解方程 2x + 3 = 7", "solution": "让我们逐步解决:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\n因此,\\boxed{2}"}
|
||||||
|
```
|
||||||
|
|
||||||
|
CSV 格式示例:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
problem,solution
|
||||||
|
"求解方程 2x + 3 = 7","让我们逐步解决:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\n因此,\\boxed{2}"
|
||||||
|
```
|
||||||
|
|
||||||
|
## 配置说明
|
||||||
|
|
||||||
|
要进行数学推理评测,你需要设置三个主要组件:
|
||||||
|
|
||||||
|
1. 数据集读取配置
|
||||||
|
|
||||||
|
```python
|
||||||
|
math_reader_cfg = dict(
|
||||||
|
input_columns=['problem'], # 问题列的名称
|
||||||
|
output_column='solution' # 答案列的名称
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. 推理配置
|
||||||
|
|
||||||
|
```python
|
||||||
|
math_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}\n请逐步推理,并将最终答案放在 \\boxed{} 中。',
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
3. 评测配置
|
||||||
|
|
||||||
|
```python
|
||||||
|
math_eval_cfg = dict(
|
||||||
|
evaluator=dict(type=MATHEvaluator),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 使用 CustomDataset
|
||||||
|
|
||||||
|
以下是如何设置完整的数学评测配置:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
from opencompass.datasets import CustomDataset
|
||||||
|
|
||||||
|
math_datasets = [
|
||||||
|
dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
abbr='my-math-dataset', # 数据集简称
|
||||||
|
path='path/to/your/dataset', # 数据集文件路径
|
||||||
|
reader_cfg=math_reader_cfg,
|
||||||
|
infer_cfg=math_infer_cfg,
|
||||||
|
eval_cfg=math_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
## MATHEvaluator
|
||||||
|
|
||||||
|
MATHEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发,该库提供了数学表达式解析和验证功能,支持 LaTeX 和一般表达式的提取与等价性验证。
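作为参考,下面给出直接调用 math_verify 库进行解析与等价性验证的最小示例(假设已通过 `pip install math-verify` 安装,具体接口与返回值以该库实际版本为准):

```python
from math_verify import parse, verify

# 解析参考答案与模型预测(支持 LaTeX 表达式)
gold = parse(r"$\frac{1}{2}$")
pred = parse(r"$0.5$")

# 验证两者在数学上是否等价(此处分数与小数应视为等价)
print(verify(gold, pred))
```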
|
||||||
|
|
||||||
|
MATHEvaluator 具有以下功能:
|
||||||
|
|
||||||
|
1. 使用 LaTeX 提取器从预测和参考答案中提取答案
|
||||||
|
2. 处理各种 LaTeX 格式和环境
|
||||||
|
3. 验证预测答案和参考答案之间的数学等价性
|
||||||
|
4. 提供详细的评测结果,包括:
|
||||||
|
- 准确率分数
|
||||||
|
- 预测和参考答案的详细比较
|
||||||
|
- 预测和参考答案的解析结果
|
||||||
|
|
||||||
|
评测器支持:
|
||||||
|
|
||||||
|
- 基本算术运算
|
||||||
|
- 分数和小数
|
||||||
|
- 代数表达式
|
||||||
|
- 三角函数
|
||||||
|
- 根式和指数
|
||||||
|
- 数学符号和运算符
|
||||||
|
|
||||||
|
评测输出示例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
'accuracy': 85.0, # 正确答案的百分比
|
||||||
|
'details': [
|
||||||
|
{
|
||||||
|
'predictions': 'x = 2', # 解析后的预测答案
|
||||||
|
'references': 'x = 2', # 解析后的参考答案
|
||||||
|
'correct': True # 是否匹配
|
||||||
|
},
|
||||||
|
# ... 更多结果
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 完整示例
|
||||||
|
|
||||||
|
以下是设置数学评测的完整示例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
from opencompass.datasets import CustomDataset
|
||||||
|
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
|
||||||
|
# 数据集读取配置
|
||||||
|
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
|
||||||
|
|
||||||
|
# 推理配置
|
||||||
|
math_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}\n请逐步推理,并将最终答案放在 \\boxed{} 中。',
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
# 评测配置
|
||||||
|
math_eval_cfg = dict(
|
||||||
|
evaluator=dict(type=MATHEvaluator),
|
||||||
|
)
|
||||||
|
|
||||||
|
# 数据集配置
|
||||||
|
math_datasets = [
|
||||||
|
dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
abbr='my-math-dataset',
|
||||||
|
path='path/to/your/dataset.jsonl', # 或 .csv
|
||||||
|
reader_cfg=math_reader_cfg,
|
||||||
|
infer_cfg=math_infer_cfg,
|
||||||
|
eval_cfg=math_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# 模型配置
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='your-model-name',
|
||||||
|
path='your/model/path',
|
||||||
|
# ... 其他模型配置
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# 输出目录
|
||||||
|
work_dir = './outputs/math_eval'
|
||||||
|
```
|
@ -91,4 +91,16 @@
|
|||||||
return dataset
|
return dataset
|
||||||
```
|
```
|
||||||
|
|
||||||
|
3. 在完成数据集脚本和配置文件的构建后,需要在OpenCompass主目录下的`dataset-index.yml`配置文件中登记新数据集的相关信息,以使其加入OpenCompass官网Doc的数据集统计列表中。
|
||||||
|
|
||||||
|
- 需要填写的字段包括数据集名称`name`、数据集类型`category`、原文或项目地址`paper`、以及数据集配置文件的路径`configpath`。具体示例如下:
|
||||||
|
|
||||||
|
```
|
||||||
|
- mydataset:
|
||||||
|
name: MyDataset
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/xxxxxxx
|
||||||
|
configpath: opencompass/configs/datasets/MyDataset
|
||||||
|
```
|
||||||
|
|
||||||
详细的数据集配置文件以及其他需要的配置文件可以参考[配置文件](../user_guides/config.md)教程,启动任务相关的教程可以参考[快速开始](../get_started/quick_start.md)教程。
|
详细的数据集配置文件以及其他需要的配置文件可以参考[配置文件](../user_guides/config.md)教程,启动任务相关的教程可以参考[快速开始](../get_started/quick_start.md)教程。
|
||||||
|
65
docs/zh_cn/advanced_guides/persistence.md
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
# 评测结果持久化
|
||||||
|
|
||||||
|
## 介绍
|
||||||
|
|
||||||
|
通常情况下,OpenCompass的评测结果会保存到工作目录下。但在某些情况下,用户之间可能需要共享数据,或快速查看已有的公共评测结果。因此,我们提供了一个能够将评测结果快速转存到外部公共数据站的接口,并在此基础上提供了对数据站的上传、更新、读取等功能。
|
||||||
|
|
||||||
|
## 快速开始
|
||||||
|
|
||||||
|
### 向数据站存储数据
|
||||||
|
|
||||||
|
通过在CLI评测指令中添加`args`或在Eval脚本中添加配置,即可将本次评测结果存储到您所指定的路径,示例如下:
|
||||||
|
|
||||||
|
(方式1)在指令中添加`args`选项并指定你的公共路径地址。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass ... -sp '/your_path'
|
||||||
|
```
|
||||||
|
|
||||||
|
(方式2)在Eval脚本中添加配置。
|
||||||
|
|
||||||
|
```python
|
||||||
|
station_path = '/your_path'
|
||||||
|
```
|
||||||
|
|
||||||
|
### 向数据站更新数据
|
||||||
|
|
||||||
|
上述存储方法在上传数据前会首先根据模型和数据集配置中的`abbr`属性来判断数据站中是否已有相同任务结果。若已有结果,则取消本次存储。如果您需要更新这部分结果,请在指令中添加`station-overwrite`选项,示例如下:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass ... -sp '/your_path' --station-overwrite
|
||||||
|
```
|
||||||
|
|
||||||
|
### 读取数据站中已有的结果
|
||||||
|
|
||||||
|
您可以直接从数据站中读取已有的结果,以避免重复进行评测任务。读取到的结果会直接参与到`summarize`步骤。采用该配置时,仅有数据站中未存储结果的任务会被启动。示例如下:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass ... -sp '/your_path' --read-from-station
|
||||||
|
```
|
||||||
|
|
||||||
|
### 指令组合
|
||||||
|
|
||||||
|
1. 仅向数据站上传最新工作目录下的结果,不补充运行缺失结果的任务:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass ... -sp '/your_path' -r latest -m viz
|
||||||
|
```
|
||||||
|
|
||||||
|
## 数据站存储格式
|
||||||
|
|
||||||
|
在数据站中,评测结果按照每个`model-dataset`对的结果存储为`json`文件。具体的目录组织形式为`/your_path/dataset_name/model_name.json`。每个`json`文件都存储了对应结果的字典,包括`predictions`、`results`以及`cfg`三个子项,具体示例如下:
|
||||||
|
|
||||||
|
```python
|
||||||
|
Result = {
|
||||||
|
'predictions': List[Dict],
|
||||||
|
'results': Dict,
|
||||||
|
'cfg': Dict = {
|
||||||
|
'models': Dict,
|
||||||
|
'datasets': Dict,
|
||||||
|
(Only subjective datasets)'judge_models': Dict
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
其中,`predictions`记录了模型对数据集中每一条数据的prediction的结果,`results`记录了模型在该数据集上的评分,`cfg`记录了该评测任务中模型和数据集的详细配置。
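按照上述目录组织方式,读取某个 `model-dataset` 结果的示意代码如下(路径与文件名仅为示例,实际以数据站中的内容为准):

```python
import json
from pathlib import Path

station = Path('/your_path')  # 数据站根目录
result_file = station / 'my-dataset' / 'my-model.json'

with open(result_file, 'r', encoding='utf-8') as f:
    result = json.load(f)

print(result['results'])           # 模型在该数据集上的评分
print(len(result['predictions']))  # 逐条 prediction 的数量
print(result['cfg']['models'])     # 本次评测使用的模型配置
```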
|
@ -117,6 +117,10 @@ html_js_files = [
|
|||||||
'js/custom.js'
|
'js/custom.js'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
html_context = {
|
||||||
|
'github_version': 'main',
|
||||||
|
}
|
||||||
|
|
||||||
# -- Options for HTMLHelp output ---------------------------------------------
|
# -- Options for HTMLHelp output ---------------------------------------------
|
||||||
|
|
||||||
# Output file base name for HTML help builder.
|
# Output file base name for HTML help builder.
|
||||||
@ -224,6 +228,7 @@ notfound_template = '404.html'
|
|||||||
|
|
||||||
def builder_inited_handler(app):
|
def builder_inited_handler(app):
|
||||||
subprocess.run(['./cp_origin_docs.sh'])
|
subprocess.run(['./cp_origin_docs.sh'])
|
||||||
|
subprocess.run(['./statis.py'])
|
||||||
|
|
||||||
|
|
||||||
def setup(app):
|
def setup(app):
|
||||||
|
@ -40,8 +40,7 @@ OpenCompass 上手路线
|
|||||||
user_guides/evaluation.md
|
user_guides/evaluation.md
|
||||||
user_guides/experimentation.md
|
user_guides/experimentation.md
|
||||||
user_guides/metrics.md
|
user_guides/metrics.md
|
||||||
user_guides/summarizer.md
|
user_guides/deepseek_r1.md
|
||||||
user_guides/corebench.md
|
|
||||||
|
|
||||||
.. _提示词:
|
.. _提示词:
|
||||||
.. toctree::
|
.. toctree::
|
||||||
@ -62,17 +61,13 @@ OpenCompass 上手路线
|
|||||||
advanced_guides/custom_dataset.md
|
advanced_guides/custom_dataset.md
|
||||||
advanced_guides/new_model.md
|
advanced_guides/new_model.md
|
||||||
advanced_guides/evaluation_lmdeploy.md
|
advanced_guides/evaluation_lmdeploy.md
|
||||||
advanced_guides/evaluation_lightllm.md
|
|
||||||
advanced_guides/accelerator_intro.md
|
advanced_guides/accelerator_intro.md
|
||||||
|
advanced_guides/math_verify.md
|
||||||
|
advanced_guides/llm_judge.md
|
||||||
advanced_guides/code_eval.md
|
advanced_guides/code_eval.md
|
||||||
advanced_guides/code_eval_service.md
|
advanced_guides/code_eval_service.md
|
||||||
advanced_guides/prompt_attack.md
|
|
||||||
advanced_guides/longeval.md
|
|
||||||
advanced_guides/subjective_evaluation.md
|
advanced_guides/subjective_evaluation.md
|
||||||
advanced_guides/circular_eval.md
|
advanced_guides/persistence.md
|
||||||
advanced_guides/contamination_eval.md
|
|
||||||
advanced_guides/compassbench_intro.md
|
|
||||||
advanced_guides/needleinahaystack_eval.md
|
|
||||||
|
|
||||||
.. _工具:
|
.. _工具:
|
||||||
.. toctree::
|
.. toctree::
|
||||||
@ -81,6 +76,13 @@ OpenCompass 上手路线
|
|||||||
|
|
||||||
tools.md
|
tools.md
|
||||||
|
|
||||||
|
.. _数据集列表:
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:caption: 数据集列表
|
||||||
|
|
||||||
|
dataset_statistics.md
|
||||||
|
|
||||||
.. _其他说明:
|
.. _其他说明:
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
98
docs/zh_cn/statis.py
Executable file
@ -0,0 +1,98 @@
|
|||||||
|
#! /usr/bin/env python
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
OC_ROOT = Path(__file__).absolute().parents[2]
|
||||||
|
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
|
||||||
|
DATASETZOO_TEMPLATE = """\
|
||||||
|
# 数据集统计
|
||||||
|
|
||||||
|
在本页面中,我们列举了OpenCompass所支持的所有数据集。
|
||||||
|
|
||||||
|
你可以使用排序和搜索功能找到需要的数据集。
|
||||||
|
|
||||||
|
我们对每一个数据集都给出了推荐的运行配置,部分数据集中还提供了基于LLM Judge的推荐配置。
|
||||||
|
|
||||||
|
你可以基于推荐配置快速启动评测。但请注意,推荐配置可能随时间推移被更新。
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
with open('dataset_statistics.md', 'w') as f:
|
||||||
|
f.write(DATASETZOO_TEMPLATE)
|
||||||
|
|
||||||
|
load_path = str(OC_ROOT / 'dataset-index.yml')
|
||||||
|
|
||||||
|
with open(load_path, 'r') as f2:
|
||||||
|
data_list = yaml.load(f2, Loader=yaml.FullLoader)
|
||||||
|
|
||||||
|
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
|
||||||
|
|
||||||
|
recommanded_dataset_list = [
|
||||||
|
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
|
||||||
|
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
|
||||||
|
'mmlu_pro', 'musr', 'math500'
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def table_format(data_list):
|
||||||
|
table_format_list = []
|
||||||
|
for i in data_list:
|
||||||
|
table_format_list_sub = []
|
||||||
|
for j in i:
|
||||||
|
if j in recommanded_dataset_list:
|
||||||
|
link_token = '[链接]('
|
||||||
|
else:
|
||||||
|
link_token = '[链接(TBD)]('
|
||||||
|
|
||||||
|
for index in HEADER:
|
||||||
|
if index == 'paper':
|
||||||
|
table_format_list_sub.append('[链接](' + i[j][index] + ')')
|
||||||
|
elif index == 'configpath_llmjudge':
|
||||||
|
if i[j][index] == '':
|
||||||
|
table_format_list_sub.append(i[j][index])
|
||||||
|
else:
|
||||||
|
table_format_list_sub.append(link_token +
|
||||||
|
GITHUB_PREFIX +
|
||||||
|
i[j][index] + ')')
|
||||||
|
elif index == 'configpath':
|
||||||
|
if isinstance(i[j][index], list):
|
||||||
|
sub_list_text = ''
|
||||||
|
for k in i[j][index]:
|
||||||
|
sub_list_text += (link_token + GITHUB_PREFIX + k +
|
||||||
|
') / ')
|
||||||
|
table_format_list_sub.append(sub_list_text[:-2])
|
||||||
|
else:
|
||||||
|
table_format_list_sub.append(link_token +
|
||||||
|
GITHUB_PREFIX +
|
||||||
|
i[j][index] + ')')
|
||||||
|
else:
|
||||||
|
table_format_list_sub.append(i[j][index])
|
||||||
|
table_format_list.append(table_format_list_sub)
|
||||||
|
return table_format_list
|
||||||
|
|
||||||
|
|
||||||
|
data_format_list = table_format(data_list)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_table(data_list, title=None):
|
||||||
|
|
||||||
|
with open('dataset_statistics.md', 'a') as f:
|
||||||
|
if title is not None:
|
||||||
|
f.write(f'\n{title}')
|
||||||
|
f.write("""\n```{table}\n:class: dataset\n""")
|
||||||
|
header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置', '推荐配置(基于LLM评估)']
|
||||||
|
table_cfg = dict(tablefmt='pipe',
|
||||||
|
floatfmt='.2f',
|
||||||
|
numalign='right',
|
||||||
|
stralign='center')
|
||||||
|
f.write(tabulate(data_list, header, **table_cfg))
|
||||||
|
f.write('\n```\n')
|
||||||
|
|
||||||
|
|
||||||
|
generate_table(
|
||||||
|
data_list=data_format_list,
|
||||||
|
title='## 支持数据集列表',
|
||||||
|
)
|
@ -81,3 +81,42 @@ datasets += cmnli_datasets
|
|||||||
用户可以根据需要,选择不同能力不同数据集以及不同评测方式的配置文件来构建评测脚本中数据集的部分。
|
用户可以根据需要,选择不同能力不同数据集以及不同评测方式的配置文件来构建评测脚本中数据集的部分。
|
||||||
|
|
||||||
有关如何启动评测任务,以及如何评测自建数据集可以参考相关文档。
|
有关如何启动评测任务,以及如何评测自建数据集可以参考相关文档。
|
||||||
|
|
||||||
|
### 数据集多次评测
|
||||||
|
|
||||||
|
在数据集配置中可以通过设置参数`n`来对同一数据集进行多次评测,最终返回平均指标,例如:
|
||||||
|
|
||||||
|
```python
|
||||||
|
afqmc_datasets = [
|
||||||
|
dict(
|
||||||
|
abbr="afqmc-dev",
|
||||||
|
type=AFQMCDatasetV2,
|
||||||
|
path="./data/CLUE/AFQMC/dev.json",
|
||||||
|
n=10, # 进行10次评测
|
||||||
|
reader_cfg=afqmc_reader_cfg,
|
||||||
|
infer_cfg=afqmc_infer_cfg,
|
||||||
|
eval_cfg=afqmc_eval_cfg,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
另外,对于二值评测指标(例如accuracy,pass-rate等),还可以通过设置参数`k`配合`n`进行[G-Pass@k](http://arxiv.org/abs/2412.13147)评测。G-Pass@k计算公式为:
|
||||||
|
|
||||||
|
```{math}
|
||||||
|
\text{G-Pass@}k_\tau=E_{\text{Data}}\left[ \sum_{j=\lceil \tau \cdot k \rceil}^c \frac{{c \choose j} \cdot {n - c \choose k - j}}{{n \choose k}} \right],
|
||||||
|
```
|
||||||
|
|
||||||
|
其中 $n$ 为评测次数, $c$ 为 $n$ 次运行中通过或正确的次数。配置例子如下:
|
||||||
|
|
||||||
|
```python
|
||||||
|
aime2024_datasets = [
|
||||||
|
dict(
|
||||||
|
abbr='aime2024',
|
||||||
|
type=Aime2024Dataset,
|
||||||
|
path='opencompass/aime2024',
|
||||||
|
k=[2, 4], # 返回 G-Pass@2和G-Pass@4的结果
|
||||||
|
n=12, # 12次评测
|
||||||
|
...
|
||||||
|
)
|
||||||
|
]
|
||||||
|
```
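为帮助理解上面的 G-Pass@k 公式,下面给出一个按该公式对单条样本计算 G-Pass@k 的示意脚本(仅用于说明计算过程,并非 OpenCompass 内部实现):

```python
from math import ceil, comb


def g_pass_at_k(n: int, c: int, k: int, tau: float) -> float:
    """按公式计算单条样本的 G-Pass@k_tau。

    n: 评测次数;c: n 次中通过/正确的次数;k: 抽取次数(假设 k <= n);tau: 阈值。
    """
    start = max(ceil(tau * k), 0)
    total = 0.0
    for j in range(start, min(c, k) + 1):
        total += comb(c, j) * comb(n - c, k - j) / comb(n, k)
    return total


# 例:12 次评测中 8 次正确,计算 G-Pass@4_0.5
print(g_pass_at_k(n=12, c=8, k=4, tau=0.5))
```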
|
||||||
|
192
docs/zh_cn/user_guides/deepseek_r1.md
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
# 强推理模型评测教程
|
||||||
|
|
||||||
|
OpenCompass提供针对DeepSeek R1系列推理模型的评测教程(数学数据集)。
|
||||||
|
|
||||||
|
- 在模型层面,我们建议使用Sampling解码方式,以减少Greedy解码带来的大量重复输出。
|
||||||
|
- 在数据集层面,我们对数据量较小的评测基准,使用多次评测并取平均的方式。
|
||||||
|
- 在答案验证层面,为了减少基于规则评测带来的误判,我们统一使用基于LLM验证的方式进行评测。
|
||||||
|
|
||||||
|
## 安装和准备
|
||||||
|
|
||||||
|
请按OpenCompass安装教程进行安装。
|
||||||
|
|
||||||
|
## 构建评测配置
|
||||||
|
|
||||||
|
我们在 `examples/eval_deepseek_r1.py` 中提供了示例配置,以下对评测配置进行解读。
|
||||||
|
|
||||||
|
### 评测配置解读
|
||||||
|
|
||||||
|
#### 1. 数据集与验证器配置
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 支持多运行次数的数据集配置(示例)
|
||||||
|
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets
|
||||||
|
|
||||||
|
datasets = sum(
|
||||||
|
(v for k, v in locals().items() if k.endswith('_datasets')),
|
||||||
|
[],
|
||||||
|
)
|
||||||
|
|
||||||
|
# 设置LLM验证器, 用户需事先通过LMDeploy/vLLM/SGLang等工具启动API 评测服务器,或者直接使用兼容OpenAI标准接口的模型服务
|
||||||
|
verifier_cfg = dict(
|
||||||
|
abbr='qwen2-5-32B-Instruct',
|
||||||
|
type=OpenAISDK,
|
||||||
|
path='Qwen/Qwen2.5-32B-Instruct', # 需替换实际路径
|
||||||
|
key='YOUR_API_KEY', # 需替换真实API Key
|
||||||
|
openai_api_base=['http://your-api-endpoint'], # 需替换API地址
|
||||||
|
query_per_second=16,
|
||||||
|
batch_size=1024,
|
||||||
|
temperature=0.001,
|
||||||
|
max_out_len=16384
|
||||||
|
)
|
||||||
|
|
||||||
|
# 应用验证器到所有数据集
|
||||||
|
for item in datasets:
|
||||||
|
if 'judge_cfg' in item['eval_cfg']['evaluator']:
|
||||||
|
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. 模型配置
|
||||||
|
|
||||||
|
我们提供了基于LMDeploy作为推理后端的评测示例,用户可以通过修改 `path`(即 HF 路径)来替换需要评测的模型。
|
||||||
|
|
||||||
|
```python
|
||||||
|
# LMDeploy模型配置示例
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='deepseek-r1-distill-qwen-7b-turbomind',
|
||||||
|
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
|
||||||
|
gen_config=dict(
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.95,
|
||||||
|
max_new_tokens=32768
|
||||||
|
),
|
||||||
|
max_seq_len=32768,
|
||||||
|
batch_size=64,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||||
|
),
|
||||||
|
# 可扩展14B/32B配置...
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. 评估流程配置
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 推理配置
|
||||||
|
infer = dict(
|
||||||
|
partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
|
||||||
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
|
||||||
|
|
||||||
|
# 评估配置
|
||||||
|
eval = dict(
|
||||||
|
partitioner=dict(type=NaivePartitioner, n=8),
|
||||||
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)))
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 4. 结果汇总配置
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 多运行结果平均配置
|
||||||
|
summary_groups = [
|
||||||
|
{
|
||||||
|
'name': 'AIME2024-Aveage8',
|
||||||
|
'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
|
||||||
|
},
|
||||||
|
# 其他数据集平均配置...
|
||||||
|
]
|
||||||
|
|
||||||
|
summarizer = dict(
|
||||||
|
dataset_abbrs=[
|
||||||
|
['AIME2024-Aveage8', 'naive_average'],
|
||||||
|
# 其他数据集指标...
|
||||||
|
],
|
||||||
|
summary_groups=summary_groups
|
||||||
|
)
|
||||||
|
|
||||||
|
# 工作目录设置
|
||||||
|
work_dir = "outputs/deepseek_r1_reasoning"
|
||||||
|
```
|
||||||
|
|
||||||
|
## 执行评测
|
||||||
|
|
||||||
|
### 场景1:模型1卡加载,数据1个worker评测,共使用1个GPU
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass examples/eval_deepseek_r1.py --debug --dump-eval-details
|
||||||
|
```
|
||||||
|
|
||||||
|
评测日志会在命令行输出。
|
||||||
|
|
||||||
|
### 场景2:模型1卡加载,数据8个worker评测,共使用8个GPU
|
||||||
|
|
||||||
|
需要修改配置文件中的infer配置,将num_worker设置为8
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 推理配置
|
||||||
|
infer = dict(
|
||||||
|
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
||||||
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
|
||||||
|
```
|
||||||
|
|
||||||
|
同时评测命令去掉`--debug`参数
|
||||||
|
|
||||||
|
```bash
|
||||||
|
opencompass examples/eval_deepseek_r1.py --dump-eval-details
|
||||||
|
```
|
||||||
|
|
||||||
|
此模式下,OpenCompass将使用多线程启动`$num_worker`个任务,命令行不展示具体日志,具体的评测日志将会在`$work_dir`下展示。
|
||||||
|
|
||||||
|
### 场景3:模型2卡加载,数据4个worker评测,共使用8个GPU
|
||||||
|
|
||||||
|
需要注意,模型配置中`run_cfg`的`num_gpus`需要设置为2(如使用推理后端,则推理后端的参数也需同步修改,比如LMDeploy中的`tp`需设置为2),同时将`infer`配置中的`num_worker`修改为4。
|
||||||
|
|
||||||
|
```python
|
||||||
|
models += [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='deepseek-r1-distill-qwen-14b-turbomind',
|
||||||
|
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
|
||||||
|
gen_config=dict(
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.95,
|
||||||
|
max_new_tokens=32768),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=32768,
|
||||||
|
batch_size=128,
|
||||||
|
run_cfg=dict(num_gpus=2),
|
||||||
|
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||||
|
),
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 推理配置
|
||||||
|
infer = dict(
|
||||||
|
partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
|
||||||
|
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 评测结果
|
||||||
|
|
||||||
|
评测结果展示如下:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dataset           version    metric         mode    deepseek-r1-distill-qwen-7b-turbomind
----------------  ---------  -------------  ------  ---------------------------------------
MATH              -          -              -
AIME2024-Aveage8  -          naive_average  gen     56.25
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## 性能基线参考
|
||||||
|
|
||||||
|
由于模型使用Sampling进行解码,同时AIME数据量较小,在使用8次评测取平均的情况下,仍会出现1-3分的性能抖动。
|
||||||
|
|
||||||
|
| 模型 | 数据集 | 指标 | 数值 |
|
||||||
|
| ---------------------------- | -------- | -------- | ---- |
|
||||||
|
| DeepSeek-R1-Distill-Qwen-7B | AIME2024 | Accuracy | 56.3 |
|
||||||
|
| DeepSeek-R1-Distill-Qwen-14B | AIME2024 | Accuracy | 74.2 |
|
||||||
|
| DeepSeek-R1-Distill-Qwen-32B | AIME2024 | Accuracy | 74.2 |
|
@ -57,7 +57,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
|
|||||||
- `-w`: 指定工作路径,默认为 `./outputs/default`
|
- `-w`: 指定工作路径,默认为 `./outputs/default`
|
||||||
- `-l`: 打开飞书机器人状态上报。
|
- `-l`: 打开飞书机器人状态上报。
|
||||||
- `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
|
- `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
|
||||||
- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。
|
- `--dump-eval-details`: 默认开启,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。如不需要开启,需设置`--dump-eval-details False`。
|
||||||
|
|
||||||
以运行模式 `-m all` 为例,整体运行流如下:
|
以运行模式 `-m all` 为例,整体运行流如下:
|
||||||
|
|
||||||
|
137
examples/eval_academic_leaderboard_202502.py
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
# flake8: noqa
|
||||||
|
|
||||||
|
from mmengine.config import read_base
|
||||||
|
|
||||||
|
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||||
|
from opencompass.runners import LocalRunner, VOLCRunner
|
||||||
|
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 0 Essential Configs #
|
||||||
|
#######################################################################
|
||||||
|
with read_base():
|
||||||
|
# Datasets Part
|
||||||
|
# Knowledge
|
||||||
|
# Math
|
||||||
|
from opencompass.configs.datasets.aime2024.aime2024_0shot_nocot_genericllmeval_academic_gen import \
|
||||||
|
aime2024_datasets
|
||||||
|
from opencompass.configs.datasets.bbh.bbh_0shot_nocot_academic_gen import \
|
||||||
|
bbh_datasets
|
||||||
|
# General Reasoning
|
||||||
|
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
|
||||||
|
gpqa_datasets
|
||||||
|
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
|
||||||
|
humaneval_datasets
|
||||||
|
# Instruction Following
|
||||||
|
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
|
||||||
|
ifeval_datasets
|
||||||
|
from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \
|
||||||
|
LCBCodeGeneration_dataset
|
||||||
|
from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \
|
||||||
|
math_datasets
|
||||||
|
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
|
||||||
|
mmlu_pro_datasets
|
||||||
|
# Model List
|
||||||
|
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
|
||||||
|
models as hf_internlm2_5_7b_chat_model
|
||||||
|
# Summary Groups
|
||||||
|
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
|
||||||
|
from opencompass.configs.summarizers.groups.mmlu_pro import \
|
||||||
|
mmlu_pro_summary_groups
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 1 Datasets List #
|
||||||
|
#######################################################################
|
||||||
|
# datasets list for evaluation
|
||||||
|
# Only take LCB generation for evaluation
|
||||||
|
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
|
||||||
|
[]) + [LCBCodeGeneration_dataset]
|
||||||
|
|
||||||
|
# LLM judge config: using LLM to evaluate predictions
|
||||||
|
judge_cfg = dict()
|
||||||
|
for dataset in datasets:
|
||||||
|
dataset['infer_cfg']['inferencer']['max_out_len'] = 32768
|
||||||
|
if 'judge_cfg' in dataset['eval_cfg']['evaluator']:
|
||||||
|
dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
|
||||||
|
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 2 Datset Summarizer #
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
core_summary_groups = [
|
||||||
|
{
|
||||||
|
'name':
|
||||||
|
'core_average',
|
||||||
|
'subsets': [
|
||||||
|
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||||
|
['bbh', 'naive_average'],
|
||||||
|
['math_prm800k_500', 'accuracy'],
|
||||||
|
['aime2024', 'accuracy'],
|
||||||
|
['GPQA_diamond', 'accuracy'],
|
||||||
|
['mmlu_pro', 'naive_average'],
|
||||||
|
['openai_humaneval', 'humaneval_pass@1'],
|
||||||
|
['lcb_code_generation', 'pass@1'],
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
summarizer = dict(
|
||||||
|
dataset_abbrs=[
|
||||||
|
['core_average', 'naive_average'],
|
||||||
|
'',
|
||||||
|
'Instruction Following',
|
||||||
|
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||||
|
'',
|
||||||
|
'General Reasoning',
|
||||||
|
['bbh', 'naive_average'],
|
||||||
|
['GPQA_diamond', 'accuracy'],
|
||||||
|
'',
|
||||||
|
'Math Calculation',
|
||||||
|
['math_prm800k_500', 'accuracy'],
|
||||||
|
['aime2024', 'accuracy'],
|
||||||
|
'',
|
||||||
|
'Knowledge',
|
||||||
|
['mmlu_pro', 'naive_average'],
|
||||||
|
'',
|
||||||
|
'Code',
|
||||||
|
['openai_humaneval', 'humaneval_pass@1'],
|
||||||
|
['lcb_code_generation', 'pass@1'],
|
||||||
|
],
|
||||||
|
summary_groups=sum(
|
||||||
|
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||||
|
)
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 3 Models List #
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 4 Inference/Evaluation Configuaration #
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
# Local Runner
|
||||||
|
infer = dict(
|
||||||
|
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
||||||
|
runner=dict(
|
||||||
|
type=LocalRunner,
|
||||||
|
max_num_workers=16,
|
||||||
|
retry=0, # Modify if needed
|
||||||
|
task=dict(type=OpenICLInferTask),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# eval with local runner
|
||||||
|
eval = dict(
|
||||||
|
partitioner=dict(type=NaivePartitioner, n=10),
|
||||||
|
runner=dict(type=LocalRunner,
|
||||||
|
max_num_workers=16,
|
||||||
|
task=dict(type=OpenICLEvalTask)),
|
||||||
|
)
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 5 Utils Configuaration #
|
||||||
|
#######################################################################
|
||||||
|
work_dir = './outputs/oc_academic_202502'
|
127
examples/eval_cascade_evaluator.py
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
|
||||||
|
from mmengine.config import read_base
|
||||||
|
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
|
||||||
|
from opencompass.datasets import generic_llmjudge_postprocess
|
||||||
|
from opencompass.openicl.icl_evaluator import MATHEvaluator
|
||||||
|
from opencompass.datasets import (
|
||||||
|
MATHDataset,
|
||||||
|
math_postprocess_v2,
|
||||||
|
normalize_final_answer,
|
||||||
|
)
|
||||||
|
#######################################################################
|
||||||
|
# PART 0 Essential Configs #
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
with read_base():
|
||||||
|
# Datasets, Summarizer
|
||||||
|
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
|
||||||
|
models as lmdeploy_qwen2_5_7b_instruct_model,
|
||||||
|
)
|
||||||
|
|
||||||
|
reader_cfg = dict(input_columns=['problem'], output_column='solution')
|
||||||
|
|
||||||
|
infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
########################## Evaluator #################################
|
||||||
|
GRADER_TEMPLATE = """
|
||||||
|
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||||
|
|
||||||
|
Here are some evaluation criteria:
|
||||||
|
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||||
|
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||||
|
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||||
|
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||||
|
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||||
|
|
||||||
|
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||||
|
A: CORRECT
|
||||||
|
B: INCORRECT
|
||||||
|
Just return the letters "A" or "B", with no text around it.
|
||||||
|
|
||||||
|
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||||
|
|
||||||
|
|
||||||
|
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||||
|
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
|
||||||
|
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||||
|
|
||||||
|
Judging the correctness of candidates' answers:
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
llm_judge_evaluator = dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=MATHDataset,
|
||||||
|
path='opencompass/math',
|
||||||
|
file_name='test_prm800k_500.json',
|
||||||
|
),
|
||||||
|
judge_cfg=dict(),
|
||||||
|
)
|
||||||
|
|
||||||
|
rule_evaluator = dict(type=MATHEvaluator)
|
||||||
|
cascade_evaluator = dict(type=CascadeEvaluator,
|
||||||
|
llm_evaluator=llm_judge_evaluator,
|
||||||
|
rule_evaluator=rule_evaluator,
|
||||||
|
parallel=False
|
||||||
|
)
|
||||||
|
########################## #################################
|
||||||
|
eval_cfg = dict()
|
||||||
|
|
||||||
|
# eval_cfg['evaluator'] = rule_evaluator
|
||||||
|
# eval_cfg['evaluator'] = llm_judge_evaluator
|
||||||
|
eval_cfg['evaluator'] = cascade_evaluator
|
||||||
|
|
||||||
|
math_datasets = [
|
||||||
|
dict(
|
||||||
|
abbr='math_prm800k_500',
|
||||||
|
type=MATHDataset,
|
||||||
|
path='opencompass/math',
|
||||||
|
file_name='test_prm800k_500.json',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
infer_cfg=infer_cfg,
|
||||||
|
eval_cfg=eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
datasets = math_datasets
|
||||||
|
models = lmdeploy_qwen2_5_7b_instruct_model
|
||||||
|
|
||||||
|
|
||||||
|
work_dir = 'math_prm800k_500_cascade_evaluator'
|
212
examples/eval_deepseek_r1.py
Normal file
@ -0,0 +1,212 @@
|
|||||||
|
# Support AIME-2024 with Repeat8
|
||||||
|
# Support MATH-500
|
||||||
|
# Support OlympiadBench
|
||||||
|
# Support OmniMath
|
||||||
|
# Support LiveMathBench-202412-Hard
|
||||||
|
|
||||||
|
import os.path as osp
|
||||||
|
from itertools import product
|
||||||
|
from opencompass.models import OpenAISDK
|
||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||||
|
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||||
|
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||||
|
from opencompass.runners import LocalRunner
|
||||||
|
from opencompass.models import (
|
||||||
|
TurboMindModelwithChatTemplate,
|
||||||
|
)
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 1 Datasets List #
|
||||||
|
#######################################################################
|
||||||
|
with read_base():
|
||||||
|
# You can comment out the datasets you don't want to evaluate
|
||||||
|
|
||||||
|
# Datasets
|
||||||
|
# from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
|
||||||
|
from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
|
||||||
|
# from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
|
||||||
|
# from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
|
||||||
|
# from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets
|
||||||
|
|
||||||
|
|
||||||
|
# Summarizer
|
||||||
|
from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups
|
||||||
|
|
||||||
|
datasets = sum(
|
||||||
|
(v for k, v in locals().items() if k.endswith('_datasets')),
|
||||||
|
[],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Set LLM Verifier used for each dataset
|
||||||
|
|
||||||
|
verifier_cfg = dict(
|
||||||
|
abbr='qwen2-5-32B-Instruct',
|
||||||
|
type=OpenAISDK,
|
||||||
|
path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path
|
||||||
|
key='sk-1234', # You need to set your own API key
|
||||||
|
openai_api_base=[
|
||||||
|
'http://172.30.56.1:4000/v1', # You need to set your own API base
|
||||||
|
],
|
||||||
|
meta_template=dict(
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', api_role='HUMAN'),
|
||||||
|
dict(role='BOT', api_role='BOT', generate=True),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
query_per_second=16,
|
||||||
|
batch_size=1024,
|
||||||
|
temperature=0.001,
|
||||||
|
tokenizer_path='gpt-4o-2024-05-13',
|
||||||
|
verbose=True,
|
||||||
|
max_out_len=16384,
|
||||||
|
# max_seq_len=32768,
|
||||||
|
max_seq_len=49152,
|
||||||
|
)
|
||||||
|
|
||||||
|
for item in datasets:
|
||||||
|
# item['infer_cfg']['inferencer']['max_out_len'] = 32768 # You can unset this line if you want to avoid length cutoff
|
||||||
|
if 'judge_cfg' in item['eval_cfg']['evaluator']:
|
||||||
|
item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
|
||||||
|
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 2 Model List #
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||||
|
|
||||||
|
models += [
|
||||||
|
# You can comment out the models you don't want to evaluate
|
||||||
|
# All models use sampling mode
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='deepseek-r1-distill-qwen-7b-turbomind',
|
||||||
|
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
|
||||||
|
gen_config=dict(
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.95,
|
||||||
|
max_new_tokens=32768),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=32768,
|
||||||
|
batch_size=64,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||||
|
),
|
||||||
|
# dict(
|
||||||
|
# type=TurboMindModelwithChatTemplate,
|
||||||
|
# abbr='deepseek-r1-distill-qwen-14b-turbomind',
|
||||||
|
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
|
||||||
|
# engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
|
||||||
|
# gen_config=dict(
|
||||||
|
# do_sample=True,
|
||||||
|
# temperature=0.6,
|
||||||
|
# top_p=0.95,
|
||||||
|
# max_new_tokens=32768),
|
||||||
|
# max_seq_len=32768,
|
||||||
|
# max_out_len=32768,
|
||||||
|
# batch_size=128,
|
||||||
|
# run_cfg=dict(num_gpus=2),
|
||||||
|
# pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||||
|
# ),
|
||||||
|
# dict(
|
||||||
|
# type=TurboMindModelwithChatTemplate,
|
||||||
|
# abbr='deepseek-r1-distill-qwen-32b-turbomind',
|
||||||
|
# path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
|
||||||
|
# engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
|
||||||
|
# gen_config=dict(
|
||||||
|
# do_sample=True,
|
||||||
|
# temperature=0.6,
|
||||||
|
# top_p=0.95,
|
||||||
|
# max_new_tokens=16384),
|
||||||
|
# max_seq_len=32768,
|
||||||
|
# max_out_len=16384,
|
||||||
|
# batch_size=128,
|
||||||
|
# run_cfg=dict(num_gpus=4),
|
||||||
|
# pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||||
|
# ),
|
||||||
|
]
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 3 Inference/Evaluation #
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
# Inference configuration
|
||||||
|
infer = dict(
|
||||||
|
partitioner=dict(
|
||||||
|
type=NumWorkerPartitioner,
|
||||||
|
num_worker=1
|
||||||
|
# Similar with data-parallelism, how many workers for evaluation,
|
||||||
|
# each worker will evaluate a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker
|
||||||
|
# For example, If you have 8 GPUs, for 7B model using 1 GPU for one instance, you can set num_worker=8
|
||||||
|
# to max-utilize the GPUs.
|
||||||
|
# If you have 8 GPUs, for 14B model using 2 GPUs for one instance, you can set num_worker=4
|
||||||
|
),
|
||||||
|
runner=dict(
|
||||||
|
type=LocalRunner,
|
||||||
|
task=dict(type=OpenICLInferTask)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Evaluation configuration
|
||||||
|
eval = dict(
|
||||||
|
partitioner=dict(
|
||||||
|
type=NaivePartitioner, n=8
|
||||||
|
),
|
||||||
|
runner=dict(
|
||||||
|
type=LocalRunner,
|
||||||
|
task=dict(
|
||||||
|
type=OpenICLEvalTask)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 4 Summarizer #
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
|
||||||
|
summary_groups = sum(
|
||||||
|
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
|
||||||
|
)
|
||||||
|
|
||||||
|
summary_groups.extend([
|
||||||
|
{
|
||||||
|
'name': 'AIME2024-Aveage8',
|
||||||
|
'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'name': 'LiveMathBench-v202412-Hard-Aveage8',
|
||||||
|
'subsets':[[
|
||||||
|
f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
|
||||||
|
for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
|
||||||
|
]
|
||||||
|
}
|
||||||
|
])
|
||||||
|
|
||||||
|
# Summarizer
|
||||||
|
summarizer = dict(
|
||||||
|
dataset_abbrs=[
|
||||||
|
'MATH',
|
||||||
|
# ['LiveMathBench-k1-n1', 'pass@1'],
|
||||||
|
# ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
|
||||||
|
# ['aime2024', 'accuracy'],
|
||||||
|
['math_prm800k_500-llmjudge', 'accuracy'],
|
||||||
|
['AIME2024-Aveage8', 'naive_average'],
|
||||||
|
['LiveMathBench-v202412-Hard-Aveage8', 'naive_average'],
|
||||||
|
['OlympiadBenchMath', 'accuracy'],
|
||||||
|
['OmniMath', 'accuracy'],
|
||||||
|
],
|
||||||
|
summary_groups=summary_groups,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# PART 5 Utils #
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
work_dir = 'outputs/deepseek_r1_reasoning'
|
||||||
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
|||||||
from mmengine.config import read_base
|
from mmengine.config import read_base
|
||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .datasets.dingo.dingo_gen import datasets
|
from opencompass.configs.datasets.dingo.dingo_gen import datasets
|
||||||
from .models.hf_internlm.hf_internlm_7b import models
|
from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
|
||||||
|
|
||||||
work_dir = './outputs/eval_dingo'
|
work_dir = './outputs/eval_dingo'
|
||||||
|
116
examples/eval_llm_judge.py
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.models.openai_api import OpenAISDK
|
||||||
|
|
||||||
|
# Import pre-configured models from OpenCompass
|
||||||
|
with read_base():
|
||||||
|
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
|
||||||
|
models as lmdeploy_qwen2_5_7b_instruct_model,
|
||||||
|
)
|
||||||
|
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
|
||||||
|
models as lmdeploy_qwen2_5_14b_instruct_model,
|
||||||
|
)
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.evaluator import GenericLLMEvaluator
|
||||||
|
from opencompass.datasets import generic_llmjudge_postprocess
|
||||||
|
from opencompass.datasets import CustomDataset
|
||||||
|
|
||||||
|
|
||||||
|
# Dataset reader configuration
|
||||||
|
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
|
||||||
|
|
||||||
|
# Inference configuration
|
||||||
|
math_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt='{problem}\nRemember to put your final answer within \\boxed{}.',
|
||||||
|
),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Template for the LLM judge
|
||||||
|
GRADER_TEMPLATE = """
|
||||||
|
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||||
|
|
||||||
|
Here are some evaluation criteria:
|
||||||
|
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||||
|
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||||
|
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||||
|
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||||
|
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||||
|
|
||||||
|
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||||
|
A: CORRECT
|
||||||
|
B: INCORRECT
|
||||||
|
Just return the letters "A" or "B", with no text around it.
|
||||||
|
|
||||||
|
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||||
|
|
||||||
|
|
||||||
|
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||||
|
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
||||||
|
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||||
|
|
||||||
|
Judging the correctness of candidates' answers:
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
# Evaluation configuration using LLM as judge
|
||||||
|
math_eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
path='opencompass/math',
|
||||||
|
file_name='test_prm800k_500.jsonl',
|
||||||
|
reader_cfg=math_reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0],
|
||||||
|
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Dataset configuration
|
||||||
|
datasets = [
|
||||||
|
dict(
|
||||||
|
type=CustomDataset,
|
||||||
|
path='opencompass/math',
|
||||||
|
file_name='test_prm800k_500.jsonl',
|
||||||
|
reader_cfg=math_reader_cfg,
|
||||||
|
infer_cfg=math_infer_cfg,
|
||||||
|
eval_cfg=math_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# Model to be evaluated
|
||||||
|
models = lmdeploy_qwen2_5_7b_instruct_model
|
||||||
|
|
||||||
|
# Limiting test to first 8 examples for quick testing
|
||||||
|
math_reader_cfg['test_range'] = '[0:8]'
|
||||||
|
|
||||||
|
# Output directory
|
||||||
|
work_dir = 'outputs/llm_judge'
|
77
examples/eval_math_verify.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.models import TurboMindModelwithChatTemplate
|
||||||
|
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||||
|
|
||||||
|
with read_base():
|
||||||
|
from opencompass.configs.datasets.math.math_500_gen import math_datasets
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='deepseek-r1-distill-llama-8b-turbomind',
|
||||||
|
path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
|
||||||
|
gen_config=dict(
|
||||||
|
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
|
||||||
|
),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=32768,
|
||||||
|
batch_size=32,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
pred_postprocessor=dict(type=extract_non_reasoning_content),
|
||||||
|
),
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='deepseek-r1-distill-qwen-7b-turbomind',
|
||||||
|
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
|
||||||
|
gen_config=dict(
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.95,
|
||||||
|
max_new_tokens=32768,
|
||||||
|
do_sample=True,
|
||||||
|
),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=32768,
|
||||||
|
batch_size=32,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
pred_postprocessor=dict(type=extract_non_reasoning_content),
|
||||||
|
),
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
|
||||||
|
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
|
||||||
|
gen_config=dict(
|
||||||
|
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
|
||||||
|
),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=32768,
|
||||||
|
batch_size=32,
|
||||||
|
run_cfg=dict(num_gpus=1),
|
||||||
|
pred_postprocessor=dict(type=extract_non_reasoning_content),
|
||||||
|
),
|
||||||
|
dict(
|
||||||
|
type=TurboMindModelwithChatTemplate,
|
||||||
|
abbr='deepseek-r1-distill-qwen-14b-turbomind',
|
||||||
|
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
|
||||||
|
engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
|
||||||
|
gen_config=dict(
|
||||||
|
top_k=1,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.95,
|
||||||
|
max_new_tokens=32768,
|
||||||
|
do_sample=True,
|
||||||
|
),
|
||||||
|
max_seq_len=32768,
|
||||||
|
max_out_len=32768,
|
||||||
|
batch_size=16,
|
||||||
|
run_cfg=dict(num_gpus=2),
|
||||||
|
pred_postprocessor=dict(type=extract_non_reasoning_content),
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
datasets = [*math_datasets]
|
||||||
|
|
||||||
|
|
||||||
|
work_dir = './outputs/math_500'
|
@ -36,8 +36,8 @@ infer = dict(
|
|||||||
eval = dict(
|
eval = dict(
|
||||||
partitioner=dict(
|
partitioner=dict(
|
||||||
type=SubjectiveNaivePartitioner,
|
type=SubjectiveNaivePartitioner,
|
||||||
models=[gpt_4o_2024_05_13_model],
|
models=models,
|
||||||
judge_models=[gpt_4o_2024_05_13_model],
|
judge_models=judge_models,
|
||||||
),
|
),
|
||||||
runner=dict(type=LocalRunner,
|
runner=dict(type=LocalRunner,
|
||||||
max_num_workers=256,
|
max_num_workers=256,
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = '0.4.0'
|
__version__ = '0.4.2'
|
||||||
|
@@ -12,7 +12,8 @@ from mmengine.config import Config, DictAction
 from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
 from opencompass.runners import SlurmRunner
 from opencompass.summarizers import DefaultSummarizer
-from opencompass.utils import LarkReporter, get_logger
+from opencompass.utils import (LarkReporter, get_logger, read_from_station,
+                               save_to_station)
 from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg,
                                    get_config_from_arg)
@@ -118,8 +119,11 @@ def parse_args():
     parser.add_argument(
         '--dump-eval-details',
         help='Whether to dump the evaluation details, including the '
-        'correctness of each sample, bpb, etc.',
-        action='store_true',
+        'correctness of each sample, bpb, etc. Defaults to True.',
+        nargs='?',
+        const=True,
+        default=True,
+        type=lambda x: False if x and x.lower() == 'false' else True
     )
     parser.add_argument(
         '--dump-extract-rate',
@@ -127,6 +131,27 @@ def parse_args():
         'correctness of each sample, bpb, etc.',
         action='store_true',
     )
+
+    parser.add_argument('-sp',
+                        '--station-path',
+                        help='Path to your results station.',
+                        type=str,
+                        default=None,
+                        )
+
+    parser.add_argument('--station-overwrite',
+                        help='Whether to overwrite the results at station.',
+                        action='store_true',
+                        )
+
+    parser.add_argument(
+        '--read-from-station',
+        help='Whether to read the evaluation results from the '
+        'data station.',
+        action='store_true',
+    )
+
     # set srun args
     slurm_parser = parser.add_argument_group('slurm_args')
     parse_slurm_args(slurm_parser)
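The `--dump-eval-details` change above turns a plain `store_true` flag into an option that defaults to True but still accepts an explicit value. A self-contained sketch of the same argparse pattern (flag and parser names here are illustrative, not taken from OpenCompass):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--dump-eval-details',
    nargs='?',            # the value is optional: bare flag, no flag, or explicit value
    const=True,           # bare `--dump-eval-details` -> True
    default=True,         # flag omitted entirely -> True
    type=lambda x: False if x and x.lower() == 'false' else True,
)

print(parser.parse_args([]).dump_eval_details)                                 # True
print(parser.parse_args(['--dump-eval-details']).dump_eval_details)            # True
print(parser.parse_args(['--dump-eval-details', 'False']).dump_eval_details)   # False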
@@ -177,8 +202,6 @@ def parse_dlc_args(dlc_parser):
                            type=str)
 
-
-
 def parse_hf_args(hf_parser):
     """These args are all for the quick construction of HuggingFace models."""
     hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat')
@@ -213,7 +236,6 @@ def parse_custom_dataset_args(custom_dataset_parser):

 def main():
     args = parse_args()
-
 
     if args.num_gpus is not None:
         raise ValueError('The `--num-gpus` argument is deprecated, please use '
                          '`--hf-num-gpus` to describe number of gpus used for '
@@ -243,9 +265,11 @@ def main():
     else:
         dir_time_str = args.reuse
         logger.info(f'Reusing experiements from {dir_time_str}')
-    elif args.mode in ['eval', 'viz']:
-        raise ValueError('You must specify -r or --reuse when running in eval '
-                         'or viz mode!')
+    elif args.mode in ['eval', 'viz'] and not args.read_from_station:
+        raise ValueError(
+            'You must specify -r or --reuse, or you have to specify '
+            '--read-from-station and --station-path when running in eval '
+            'or viz mode!')

     # update "actual" work_dir
     cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
@@ -262,6 +286,12 @@ def main():
     # types cannot be serialized
     cfg = Config.fromfile(output_config_path, format_python_code=False)

+    # get existed results from station
+    if args.read_from_station:
+        existing_results_list = read_from_station(cfg, args)
+        rs_exist_results = [comb['combination'] for comb in existing_results_list]
+        cfg['rs_exist_results'] = rs_exist_results
+
     # report to lark bot if specify --lark
     if not args.lark:
         cfg['lark_bot_url'] = None
@@ -269,6 +299,7 @@ def main():
         content = f'{getpass.getuser()}\'s task has been launched!'
         LarkReporter(cfg['lark_bot_url']).post(content)

+    # infer
     if args.mode in ['all', 'infer']:
         # When user have specified --slurm or --dlc, or have not set
         # "infer" in config, we will provide a default configuration
@@ -321,6 +352,9 @@ def main():
         if args.dlc or args.slurm or cfg.get('eval', None) is None:
             fill_eval_cfg(cfg, args)
         if args.dump_eval_details:
+            logger.warning('Default to dump eval details, it might take extra '
+                           'space to save all the evaluation details. '
+                           'Set --dump-eval-details False to skip the details dump')
             cfg.eval.runner.task.dump_details = True
         if args.dump_extract_rate:
             cfg.eval.runner.task.cal_extract_rate = True
@@ -350,6 +384,10 @@ def main():
     else:
         runner(tasks)

+    # save to station
+    if args.station_path is not None or cfg.get('station_path') is not None:
+        save_to_station(cfg, args)
+
     # visualize
     if args.mode in ['all', 'eval', 'viz']:
         summarizer_cfg = cfg.get('summarizer', {})
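Taken together with the new arguments above, the intended station workflow is: run an evaluation once with a results station attached, e.g. `python run.py <config>.py -sp /path/to/station` (add `--station-overwrite` to replace existing entries), then later rerun in `--mode eval` or `--mode viz` with `--read-from-station -sp /path/to/station` to reuse the stored results instead of passing `-r/--reuse`.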
@@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.arc_prize_public_evaluation import ARCPrizeDataset, ARCPrizeEvaluator


# The system_prompt defines the initial instructions for the model,
# setting the context for solving ARC tasks.
system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.'''

# User message template is a template for creating user prompts. It includes placeholders for training data and test input data,
# guiding the model to learn the rule and apply it to solve the given puzzle.
user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
----------------------------------------
{training_data}
----------------------------------------
Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data.:
----------------------------------------
[{{'input': {input_test_data}, 'output': [[]]}}]
----------------------------------------
What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:'''


arc_prize_public_evaluation_reader_cfg = dict(
    input_columns=['training_data', 'input_test_data'],
    output_column='output_test_data'
)

arc_prize_public_evaluation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='SYSTEM', fallback_role='HUMAN', prompt=system_prompt),
                dict(role='HUMAN', prompt=user_message_template),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

arc_prize_public_evaluation_eval_cfg = dict(
    evaluator=dict(type=ARCPrizeEvaluator)
)

arc_prize_public_evaluation_datasets = [
    dict(
        abbr='ARC_Prize_Public_Evaluation',
        type=ARCPrizeDataset,
        path='opencompass/arc_prize_public_evaluation',
        reader_cfg=arc_prize_public_evaluation_reader_cfg,
        infer_cfg=arc_prize_public_evaluation_infer_cfg,
        eval_cfg=arc_prize_public_evaluation_eval_cfg
    )
]
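One detail worth calling out in `user_message_template`: the braces around the test record are doubled (`{{ ... }}`) so that string formatting leaves them as literal braces while still substituting `{training_data}` and `{input_test_data}`. A toy illustration of that escaping (the sample grid is made up):

template = "[{{'input': {input_test_data}, 'output': [[]]}}]"
print(template.format(input_test_data=[[0, 1], [1, 0]]))
# -> [{'input': [[0, 1], [1, 0]], 'output': [[]]}]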
@@ -0,0 +1,45 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaokaoBenchDataset
from mmengine.config import read_base

with read_base():
    from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts

GaokaoBench_datasets = []
for folder, prompts in [
    ('Multiple-choice_Questions', MCQ_prompts),
    ('Fill-in-the-blank_Questions', FBQ_prompts),
]:
    for p in prompts:
        reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                'template': {'round': [{'role': 'HUMAN', 'prompt': p['prefix_prompt'] + '{question}'}]},
                'ice_token': '</E>',
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer},
        }
        eval_cfg = {
            'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + p['keyword'],
            'path': _base_path,
            'filename': '/' + folder + '/' + p['keyword'] + '.json',
            'name': p['keyword'],
            'reader_cfg': reader_cfg,
            'infer_cfg': infer_cfg,
            'eval_cfg': eval_cfg,
        }
        GaokaoBench_datasets.append(dataset)
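Each generated entry is keyed off the prompt metadata. For a hypothetical prompt entry the derived fields look like this (the keyword value below is illustrative, not necessarily present in `GaokaoBench_prompts`):

folder = 'Multiple-choice_Questions'
p = {'keyword': '2010-2022_Math_II_MCQs', 'type': 'single_choice'}  # illustrative values

abbr = 'GaokaoBench_' + p['keyword']                    # GaokaoBench_2010-2022_Math_II_MCQs
filename = '/' + folder + '/' + p['keyword'] + '.json'  # /Multiple-choice_Questions/2010-2022_Math_II_MCQs.json
evaluator = 'GaokaoBenchEvaluator' + '_' + p['type']    # GaokaoBenchEvaluator_single_choice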
opencompass/configs/datasets/HLE/hle_gen.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from mmengine.config import read_base

with read_base():
    # Default use LLM as a judge
    from .hle_llmverify_gen_6ff468 import hle_datasets  # noqa: F401, F403
opencompass/configs/datasets/HLE/hle_llmverify_gen_6ff468.py (new file, 91 lines)
@@ -0,0 +1,91 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import HLEDataset

# ----------------------------- Detailed Config -----------------------------

math_reader_cfg = dict(input_columns=['problem'], output_column='answer')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

# Evaluation configuration
math_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                ],
                round=[
                    dict(
                        role='HUMAN',
                        prompt=GRADER_TEMPLATE
                    ),
                ]),
        ),
        dataset_cfg=dict(
            type=HLEDataset,
            path='cais/hle',
            reader_cfg=math_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
    pred_role='BOT',
)


hle_datasets = [
    dict(
        type=HLEDataset,
        abbr='hle_llmjudge',
        path='cais/hle',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
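`judge_cfg` is intentionally left empty here, so the judge model has to be supplied by the surrounding config. A minimal sketch of how it could be filled with an OpenAI-compatible endpoint, following the same pattern as the OlymMATH README later in this commit (the endpoint, key, and model name below are placeholders, not defaults):

from opencompass.models import OpenAISDK

judge_cfg = dict(
    type=OpenAISDK,
    abbr='my-judge-model',                          # placeholder
    path='Qwen/Qwen2.5-32B-Instruct',               # placeholder judge model
    key='EMPTY',
    openai_api_base='http://localhost:23333/v1',    # placeholder endpoint
    query_per_second=16,
    max_out_len=16384,
    max_seq_len=32768,
    batch_size=8,
)

# Attach the judge to every dataset entry defined above.
for d in hle_datasets:
    d['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg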
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .IFEval_gen_3321a3 import ifeval_datasets  # noqa: F401, F403
+    from .IFEval_gen_353ae7 import ifeval_datasets  # noqa: F401, F403
@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets

# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False

assert 0 <= num_shot <= 4
if num_shot == 0:
    prompts = zero_shot_prompts
else:
    prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}

mathbench_datasets = []
for _split in mathbench_sets:
    for _name in mathbench_sets[_split]:
        if 'single_choice' in _name:
            if with_reasoning:
                template_round = prompts[_name + '_with_reasoning']
            else:
                template_round = prompts[_name]
        else:
            template_round = prompts[_name]

        if 'single_choice' in _name:
            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
        else:
            pred_postprocessor = dict(type=math_postprocess_v2)

        if 'single_choice' in _name and with_circular_eval:
            evaluator = dict(type=CircularEvaluator)
        else:
            evaluator = dict(type=AccEvaluator)

        # assemble the final config
        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
            template = {}
            for answer in ['A', 'B', 'C', 'D']:
                one_template_round = deepcopy(template_round)
                one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
                template[answer] = dict(round=one_template_round)
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=template),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=PPLInferencer),
            )
        else:
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer),
            )
        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)

        mathbench_datasets.append(
            dict(
                abbr='mathbench-' + _split + '-' + _name,
                type=MathBenchDataset,
                path=f'data/mathbench_v1/{_split}',
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=mathbench_reader_cfg,
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            )
        )
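The slice `p[- 2 * num_shot - 2:]` above assumes each few-shot prompt list alternates question/answer turns, with the final two entries being the actual question turn; keeping the last `2 * num_shot + 2` items therefore keeps `num_shot` worked examples plus the question itself. A toy check of the arithmetic (the turn contents are placeholders):

# 4 worked examples (question/answer pairs) followed by the final question pair
few_shot = ['ex1_q', 'ex1_a', 'ex2_q', 'ex2_a', 'ex3_q', 'ex3_a', 'ex4_q', 'ex4_a', 'q', 'a_slot']

num_shot = 2
kept = few_shot[- 2 * num_shot - 2:]
print(kept)  # ['ex3_q', 'ex3_a', 'ex4_q', 'ex4_a', 'q', 'a_slot'] -> 2 examples + the question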
opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py (new file, 57 lines)
@@ -0,0 +1,57 @@
from opencompass.datasets import MedXpertQADataset, MedXpertQAEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n'  # Where to put this?
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'

# Reader configuration
reader_cfg = dict(
    input_columns=[
        'question',
        'options',
        'medical_task',
        'body_system',
        'question_type',
        'prompt_mode',
    ],
    output_column='label',
)

# Inference configuration
infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt=ZERO_SHOT_PROMPT,  # prompt mode: zero-shot
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Evaluation configuration
eval_cfg = dict(
    evaluator=dict(type=MedXpertQAEvaluator),
    pred_role='BOT',
)
medxpertqa_dataset = dict(
    type=MedXpertQADataset,
    abbr='medxpertqa',
    path='TsinghuaC3I/MedXpertQA',
    prompt_mode='zero-shot',
    reader_cfg=reader_cfg,
    infer_cfg=infer_cfg,
    eval_cfg=eval_cfg,
)

medxpertqa_datasets = [medxpertqa_dataset]
@ -0,0 +1,104 @@
|
|||||||
|
from opencompass.datasets import MedXpertQADataset, MedXpertQA_llmjudge_postprocess
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.evaluator import GenericLLMEvaluator
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
|
||||||
|
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
|
||||||
|
GRADER_TEMPLATE = """
|
||||||
|
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||||
|
|
||||||
|
Here are some evaluation criteria:
|
||||||
|
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||||
|
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||||
|
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||||
|
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||||
|
|
||||||
|
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||||
|
A: CORRECT
|
||||||
|
B: INCORRECT
|
||||||
|
Just return the letters "A" or "B", with no text around it.
|
||||||
|
|
||||||
|
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||||
|
|
||||||
|
<Original Question Begin>: Q: {question}\nA: Among {start} through {end}, the answer is\n<Original Question End>\n\n
|
||||||
|
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
|
||||||
|
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||||
|
Judging the correctness of candidates' answers:
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
|
# Reader configuration
|
||||||
|
reader_cfg = dict(
|
||||||
|
input_columns=[
|
||||||
|
'question',
|
||||||
|
'options',
|
||||||
|
'medical_task',
|
||||||
|
'body_system',
|
||||||
|
'question_type',
|
||||||
|
'prompt_mode',
|
||||||
|
],
|
||||||
|
output_column='label',
|
||||||
|
)
|
||||||
|
|
||||||
|
# Inference configuration
|
||||||
|
infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Evaluation configuration
|
||||||
|
eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=MedXpertQADataset,
|
||||||
|
path='TsinghuaC3I/MedXpertQA',
|
||||||
|
prompt_mode='zero-shot',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=dict(),
|
||||||
|
dict_postprocessor=dict(type=MedXpertQA_llmjudge_postprocess),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
medxpertqa_dataset = dict(
|
||||||
|
type=MedXpertQADataset,
|
||||||
|
abbr='medxpertqa',
|
||||||
|
path='TsinghuaC3I/MedXpertQA',
|
||||||
|
prompt_mode='zero-shot',
|
||||||
|
reader_cfg=reader_cfg,
|
||||||
|
infer_cfg=infer_cfg,
|
||||||
|
eval_cfg=eval_cfg,
|
||||||
|
)
|
||||||
|
|
||||||
|
medxpertqa_datasets = [medxpertqa_dataset]
|
opencompass/configs/datasets/OlymMATH/README.md (new file, 60 lines)
@@ -0,0 +1,60 @@
# OlymMATH
[GitHub Link](https://github.com/RUCAIBox/OlymMATH)

For the OlymMATH dataset, please refer to the paper:
Challenging the Boundaries of Reasoning: An Olympiad-Level Math Benchmark for Large Language Models by Haoxiang Sun, Yingqian Min, Zhipeng Chen, Wayne Xin Zhao, Zheng Liu, Zhongyuan Wang, Lei Fang, and Ji-Rong Wen.


## How to evaluate OlymMATH with a model judge
This is a simple example:
```python
from opencompass.models import OpenAISDK, OpenAI
from mmengine.config import read_base


with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as qwen2_5_7b_instruct_model
    from opencompass.configs.datasets.OlymMATH.olymmath_gen import olymmath_datasets

################## Judge Config ##################
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

judge_cfg = dict(
    # An API model with OpenAI API format is required for the judge
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct',
    key='sk-1234',
    openai_api_base=[
        'http://172.30.56.1:4000/v1',
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    max_completion_tokens=32768,
    tokenizer_path='gpt-4o-2024-05-13',
    verbose=True,
    max_out_len=16384,
    max_seq_len=32768,
)

################## Model Config ##################
models = [*qwen2_5_7b_instruct_model]

################## Dataset Config ##################
datasets = [*olymmath_datasets]

# Set judge_cfg for evaluation
for item in datasets:
    item['infer_cfg']['inferencer']['max_out_len'] = 32768
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg


work_dir = './outputs/olymmath_llm_eval'
```
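To try the README example, save the snippet as a config file inside an OpenCompass checkout and launch it with, e.g., `python run.py <your_config>.py` (or the `opencompass` CLI); the scores and judge outputs are written under the `work_dir` set at the bottom, here `./outputs/olymmath_llm_eval`.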
@@ -0,0 +1,5 @@
from mmengine.config import read_base

with read_base():
    # Default use LLM as a judge
    from .olymmath_llmverify_gen_97b203 import olymmath_datasets  # noqa: F401, F403
@ -0,0 +1,99 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.evaluator import GenericLLMEvaluator
|
||||||
|
from opencompass.datasets import generic_llmjudge_postprocess
|
||||||
|
from opencompass.datasets import OlymMATHDataset
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------- Detailed Config -----------------------------
|
||||||
|
|
||||||
|
math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test')
|
||||||
|
|
||||||
|
math_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy']
|
||||||
|
|
||||||
|
GRADER_TEMPLATE = """
|
||||||
|
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||||
|
|
||||||
|
Here are some evaluation criteria:
|
||||||
|
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||||
|
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||||
|
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||||
|
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||||
|
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||||
|
|
||||||
|
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||||
|
A: CORRECT
|
||||||
|
B: INCORRECT
|
||||||
|
Just return the letters "A" or "B", with no text around it.
|
||||||
|
|
||||||
|
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||||
|
|
||||||
|
|
||||||
|
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||||
|
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
||||||
|
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||||
|
|
||||||
|
Judging the correctness of candidates' answers:
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
# Evaluation configuration
|
||||||
|
|
||||||
|
olymmath_datasets = []
|
||||||
|
|
||||||
|
for sub_set in sub_sets:
|
||||||
|
math_eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt = GRADER_TEMPLATE
|
||||||
|
),
|
||||||
|
]),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=OlymMATHDataset,
|
||||||
|
path='RUC-AIBOX/OlymMATH',
|
||||||
|
reader_cfg=math_reader_cfg,
|
||||||
|
subset=sub_set,
|
||||||
|
),
|
||||||
|
judge_cfg=dict(),
|
||||||
|
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||||
|
),
|
||||||
|
pred_role='BOT',
|
||||||
|
)
|
||||||
|
|
||||||
|
olymmath_datasets.append(
|
||||||
|
dict(
|
||||||
|
type=OlymMATHDataset,
|
||||||
|
abbr=f'olymmath_llmjudge_{sub_set}',
|
||||||
|
path='RUC-AIBOX/OlymMATH',
|
||||||
|
reader_cfg=math_reader_cfg,
|
||||||
|
infer_cfg=math_infer_cfg,
|
||||||
|
eval_cfg=math_eval_cfg,
|
||||||
|
subset=sub_set,
|
||||||
|
)
|
||||||
|
)
|
@ -0,0 +1,105 @@
|
|||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.evaluator import GenericLLMEvaluator
|
||||||
|
from opencompass.datasets import generic_llmjudge_postprocess
|
||||||
|
|
||||||
|
with read_base():
|
||||||
|
from .OlympiadBench_categories import math_categories as categories
|
||||||
|
|
||||||
|
# Create prompter instance for problems
|
||||||
|
olympiadbench_prompter_cfg = dict(
|
||||||
|
type='OlympiadBenchPrompter'
|
||||||
|
)
|
||||||
|
|
||||||
|
olympiadbench_reader_cfg = dict(
|
||||||
|
input_columns=[
|
||||||
|
'problem', 'language', 'subject', 'question_type',
|
||||||
|
'answer_type', 'is_multiple_answer', 'unit', 'questions'
|
||||||
|
],
|
||||||
|
output_column='solution'
|
||||||
|
)
|
||||||
|
|
||||||
|
GRADER_TEMPLATE = """
|
||||||
|
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||||
|
|
||||||
|
Here are some evaluation criteria:
|
||||||
|
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||||
|
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||||
|
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||||
|
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||||
|
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||||
|
|
||||||
|
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||||
|
A: CORRECT
|
||||||
|
B: INCORRECT
|
||||||
|
Just return the letters "A" or "B", with no text around it.
|
||||||
|
|
||||||
|
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||||
|
|
||||||
|
|
||||||
|
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||||
|
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
|
||||||
|
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||||
|
|
||||||
|
Judging the correctness of candidates' answers:
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
|
olympiadbenchMath_datasets = []
|
||||||
|
for _name in categories:
|
||||||
|
olympiadbench_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type='OlympiadBenchTemplate'
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Evaluation configuration
|
||||||
|
olympiadbench_eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt = GRADER_TEMPLATE
|
||||||
|
),
|
||||||
|
]),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=OlympiadBenchDataset,
|
||||||
|
path='opencompass/OlympiadBench',
|
||||||
|
name=_name,
|
||||||
|
reader_cfg=olympiadbench_reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=dict(),
|
||||||
|
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||||
|
),
|
||||||
|
pred_role='BOT',
|
||||||
|
)
|
||||||
|
|
||||||
|
olympiadbenchMath_datasets.append(
|
||||||
|
dict(
|
||||||
|
type=OlympiadBenchDataset,
|
||||||
|
abbr=f'OlympiadBench_{_name}',
|
||||||
|
path='opencompass/OlympiadBench',
|
||||||
|
name=_name,
|
||||||
|
reader_cfg=olympiadbench_reader_cfg,
|
||||||
|
infer_cfg=olympiadbench_infer_cfg,
|
||||||
|
eval_cfg=olympiadbench_eval_cfg,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
del _name
|
@ -0,0 +1,109 @@
|
|||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.evaluator import GenericLLMEvaluator
|
||||||
|
from opencompass.datasets import generic_llmjudge_postprocess
|
||||||
|
|
||||||
|
with read_base():
|
||||||
|
from .OlympiadBench_categories import categories
|
||||||
|
|
||||||
|
# Create prompter instance for problems
|
||||||
|
olympiadbench_prompter_cfg = dict(
|
||||||
|
type='OlympiadBenchPrompter'
|
||||||
|
)
|
||||||
|
|
||||||
|
olympiadbench_reader_cfg = dict(
|
||||||
|
input_columns=[
|
||||||
|
'problem', 'language', 'subject', 'question_type',
|
||||||
|
'answer_type', 'is_multiple_answer', 'unit', 'questions'
|
||||||
|
],
|
||||||
|
output_column='solution'
|
||||||
|
)
|
||||||
|
|
||||||
|
GRADER_TEMPLATE = """
|
||||||
|
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||||
|
|
||||||
|
Here are some evaluation criteria:
|
||||||
|
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||||
|
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||||
|
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||||
|
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||||
|
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||||
|
|
||||||
|
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||||
|
A: CORRECT
|
||||||
|
B: INCORRECT
|
||||||
|
Just return the letters "A" or "B", with no text around it.
|
||||||
|
|
||||||
|
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||||
|
|
||||||
|
|
||||||
|
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||||
|
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
|
||||||
|
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||||
|
|
||||||
|
Judging the correctness of candidates' answers:
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
|
olympiadbench_datasets = []
|
||||||
|
for _name in categories:
|
||||||
|
olympiadbench_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type='OlympiadBenchTemplate'
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer),
|
||||||
|
)
|
||||||
|
|
||||||
|
# olympiadbench_eval_cfg = dict(
|
||||||
|
# evaluator=dict(type=OlympiadBenchEvaluator, version='v2'),
|
||||||
|
# pred_postprocessor=dict(type=olympiadbench_postprocess_v2),
|
||||||
|
# )
|
||||||
|
# Evaluation configuration
|
||||||
|
olympiadbench_eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt = GRADER_TEMPLATE
|
||||||
|
),
|
||||||
|
]),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=OlympiadBenchDataset,
|
||||||
|
path='opencompass/OlympiadBench',
|
||||||
|
name=_name,
|
||||||
|
reader_cfg=olympiadbench_reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=dict(),
|
||||||
|
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
||||||
|
),
|
||||||
|
pred_role='BOT',
|
||||||
|
)
|
||||||
|
|
||||||
|
olympiadbench_datasets.append(
|
||||||
|
dict(
|
||||||
|
type=OlympiadBenchDataset,
|
||||||
|
abbr=f'OlympiadBench_{_name}',
|
||||||
|
path='opencompass/OlympiadBench',
|
||||||
|
name=_name,
|
||||||
|
reader_cfg=olympiadbench_reader_cfg,
|
||||||
|
infer_cfg=olympiadbench_infer_cfg,
|
||||||
|
eval_cfg=olympiadbench_eval_cfg,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
del _name
|
@@ -5,3 +5,14 @@ categories = [
     'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
     'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE
 ]
+
+math_categories = [
+    'OE_TO_maths_en_COMP', # OpenEnded - TextOnly - maths - COMP
+    'OE_TO_maths_zh_COMP', # OpenEnded - TextOnly - maths - COMP
+    'OE_TO_maths_zh_CEE', # OpenEnded - TextOnly - maths - CEE
+]
+
+physics_categories = [
+    'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
+    'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE
+]
@ -0,0 +1,98 @@
|
|||||||
|
# flake8: noqa
|
||||||
|
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import Aime2024Dataset
|
||||||
|
from opencompass.evaluator import GenericLLMEvaluator
|
||||||
|
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
|
||||||
|
|
||||||
|
|
||||||
|
aime2024_reader_cfg = dict(
|
||||||
|
input_columns=['question'],
|
||||||
|
output_column='answer'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
aime2024_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN',
|
||||||
|
prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer, max_out_len=2048)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
GRADER_TEMPLATE = """
|
||||||
|
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||||
|
|
||||||
|
Here are some evaluation criteria:
|
||||||
|
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||||
|
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||||
|
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||||
|
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||||
|
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||||
|
|
||||||
|
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||||
|
A: CORRECT
|
||||||
|
B: INCORRECT
|
||||||
|
Just return the letters "A" or "B", with no text around it.
|
||||||
|
|
||||||
|
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||||
|
|
||||||
|
|
||||||
|
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
|
||||||
|
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
||||||
|
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||||
|
|
||||||
|
Judging the correctness of candidates' answers:
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
|
aime2024_eval_cfg = dict(
|
||||||
|
evaluator=dict(
|
||||||
|
type=GenericLLMEvaluator,
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
begin=[
|
||||||
|
dict(
|
||||||
|
role='SYSTEM',
|
||||||
|
fallback_role='HUMAN',
|
||||||
|
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||||
|
],
|
||||||
|
round=[
|
||||||
|
dict(
|
||||||
|
role='HUMAN',
|
||||||
|
prompt=GRADER_TEMPLATE),
|
||||||
|
]),
|
||||||
|
),
|
||||||
|
dataset_cfg=dict(
|
||||||
|
type=Aime2024Dataset,
|
||||||
|
path='opencompass/aime2024',
|
||||||
|
reader_cfg=aime2024_reader_cfg,
|
||||||
|
),
|
||||||
|
judge_cfg=dict(),
|
||||||
|
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess,
|
||||||
|
metric_name='accuracy'),
|
||||||
|
),
|
||||||
|
pred_role='BOT',
|
||||||
|
)
|
||||||
|
|
||||||
|
aime2024_datasets = [
|
||||||
|
dict(
|
||||||
|
abbr='aime2024',
|
||||||
|
type=Aime2024Dataset,
|
||||||
|
path='opencompass/aime2024',
|
||||||
|
reader_cfg=aime2024_reader_cfg,
|
||||||
|
infer_cfg=aime2024_infer_cfg,
|
||||||
|
eval_cfg=aime2024_eval_cfg,
|
||||||
|
mode='singlescore',
|
||||||
|
)
|
||||||
|
]
|
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .aime2024_gen_6e39a4 import aime2024_datasets  # noqa: F401, F403
+    from .aime2024_gen_17d799 import aime2024_datasets  # noqa: F401, F403
opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py (new file, 40 lines)
@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import MATHEvaluator
from opencompass.datasets import Aime2024Dataset


aime2024_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer'
)


aime2024_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

aime2024_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator)
)

aime2024_datasets = [
    dict(
        abbr='aime2024',
        type=Aime2024Dataset,
        path='opencompass/aime2024',
        reader_cfg=aime2024_reader_cfg,
        infer_cfg=aime2024_infer_cfg,
        eval_cfg=aime2024_eval_cfg,
    )
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .aime2024_llmjudge_gen_5e9f4f import aime2024_datasets  # noqa: F401, F403
@@ -0,0 +1,90 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

aime2024_reader_cfg = dict(input_columns=['question'], output_column='answer')


aime2024_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{question}\nRemember to put your final answer within \\boxed{}.',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

    Here are some evaluation criteria:
    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
    A: CORRECT
    B: INCORRECT
    Just return the letters "A" or "B", with no text around it.

    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

    Judging the correctness of candidates' answers:
""".strip()

aime2024_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    )
                ],
                round=[
                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                ],
            ),
        ),
        dataset_cfg=dict(
            type=Aime2024Dataset,
            path='opencompass/aime2024',
            reader_cfg=aime2024_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    )
)

aime2024_datasets = [
    dict(
        abbr='aime2024',
        type=Aime2024Dataset,
        path='opencompass/aime2024',
        reader_cfg=aime2024_reader_cfg,
        infer_cfg=aime2024_infer_cfg,
        eval_cfg=aime2024_eval_cfg,
    )
]
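Note (editor's addition, not part of the diff): the LLM-judge configs above ship with an empty `judge_cfg=dict()`, so the judge model has to be supplied by the user's own eval config. A minimal sketch of what that might look like, assuming an OpenAI-compatible endpoint; the model name, URL, and rate limits below are placeholders, not values from this commit:

```python
from opencompass.models import OpenAISDK

# Hypothetical judge model served behind an OpenAI-compatible API.
judge_cfg = dict(
    type=OpenAISDK,
    abbr='judge-model',                           # placeholder name
    path='Qwen/Qwen2.5-72B-Instruct',             # placeholder judge model
    key='EMPTY',
    openai_api_base='http://localhost:23333/v1',  # assumed local endpoint
    query_per_second=8,
    max_out_len=2048,
    batch_size=8,
)
# This dict would replace the empty judge_cfg=dict() inside the evaluators above.
```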
@@ -0,0 +1,96 @@
# CoT: No CoT
# K-Shot: 0-Shot
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

aime2024_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer'
)


aime2024_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)


GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

    Here are some evaluation criteria:
    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
    A: CORRECT
    B: INCORRECT
    Just return the letters "A" or "B", with no text around it.

    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

    Judging the correctness of candidates' answers:
""".strip()

aime2024_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                ],
                round=[
                    dict(
                        role='HUMAN',
                        prompt=GRADER_TEMPLATE
                    ),
                ]),
        ),
        dataset_cfg=dict(
            type=Aime2024Dataset,
            path='opencompass/aime2024',
            reader_cfg=aime2024_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
    pred_role='BOT',
)

aime2024_datasets = [
    dict(
        abbr=f'aime2024-run{idx}',
        type=Aime2024Dataset,
        path='opencompass/aime2024',
        reader_cfg=aime2024_reader_cfg,
        infer_cfg=aime2024_infer_cfg,
        eval_cfg=aime2024_eval_cfg,
        mode='singlescore',
    )
    for idx in range(16)
]
@@ -0,0 +1,96 @@
# CoT: No CoT
# K-Shot: 0-Shot
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

aime2024_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer'
)


aime2024_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)


GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

    Here are some evaluation criteria:
    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
    A: CORRECT
    B: INCORRECT
    Just return the letters "A" or "B", with no text around it.

    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

    Judging the correctness of candidates' answers:
""".strip()

aime2024_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                ],
                round=[
                    dict(
                        role='HUMAN',
                        prompt=GRADER_TEMPLATE
                    ),
                ]),
        ),
        dataset_cfg=dict(
            type=Aime2024Dataset,
            path='opencompass/aime2024',
            reader_cfg=aime2024_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
    pred_role='BOT',
)

aime2024_datasets = [
    dict(
        abbr=f'aime2024-run{idx}',
        type=Aime2024Dataset,
        path='opencompass/aime2024',
        reader_cfg=aime2024_reader_cfg,
        infer_cfg=aime2024_infer_cfg,
        eval_cfg=aime2024_eval_cfg,
        mode='singlescore',
    )
    for idx in range(8)
]
@@ -0,0 +1,90 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CustomDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

aime2025_reader_cfg = dict(input_columns=['question'], output_column='answer')


aime2025_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{question}\nRemember to put your final answer within \\boxed{}.',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

    Here are some evaluation criteria:
    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
    A: CORRECT
    B: INCORRECT
    Just return the letters "A" or "B", with no text around it.

    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

    Judging the correctness of candidates' answers:
""".strip()

aime2025_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    )
                ],
                round=[
                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                ],
            ),
        ),
        dataset_cfg=dict(
            type=CustomDataset,
            path='opencompass/aime2025',
            reader_cfg=aime2025_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
)

aime2025_datasets = [
    dict(
        type=CustomDataset,
        abbr='aime2025',
        path='opencompass/aime2025',
        reader_cfg=aime2025_reader_cfg,
        infer_cfg=aime2025_infer_cfg,
        eval_cfg=aime2025_eval_cfg,
    )
]
26 opencompass/configs/datasets/bbeh/README.md Normal file
@@ -0,0 +1,26 @@
# BBEH

```bash
python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
```

## Models

| model | score |
|:---:|---:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |

### Details

| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 |

| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
|:---:|---:|---:|---:|---:|---:|---:|---:|---:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 |

| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
|:---:|---:|---:|---:|---:|---:|---:|---:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 |
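Editor's note (not part of the README): besides the CLI commands above, the new `bbeh_gen` config can be pulled into a Python eval config in the usual OpenCompass way. A minimal sketch, assuming the packaged config path added by this commit:

```python
from mmengine.config import read_base

with read_base():
    # Config file added in this commit; exposes `bbeh_datasets`.
    from opencompass.configs.datasets.bbeh.bbeh_gen import bbeh_datasets  # noqa: F401

datasets = [*bbeh_datasets]
```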
93 opencompass/configs/datasets/bbeh/bbeh_gen.py Normal file
@@ -0,0 +1,93 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBEHDataset, BBEHEvaluator, bbeh_mcq_postprocess, BBEHEvaluator_mcq

bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')


bbeh_multiple_choice_sets = [
    'bbeh_boolean_expressions',
    'bbeh_disambiguation_qa',
    'bbeh_geometric_shapes',
    'bbeh_hyperbaton',
    'bbeh_movie_recommendation',
    'bbeh_nycc',
    'bbeh_shuffled_objects',
]

bbeh_free_form_sets = [
    'bbeh_boardgame_qa',
    'bbeh_buggy_tables',
    'bbeh_causal_understanding',
    'bbeh_dyck_languages',
    'bbeh_linguini',
    'bbeh_multistep_arithmetic',
    'bbeh_object_counting',
    'bbeh_object_properties',
    'bbeh_sarc_triples',
    'bbeh_spatial_reasoning',
    'bbeh_sportqa',
    'bbeh_temporal_sequence',
    'bbeh_time_arithmetic',
    'bbeh_web_of_lies',
    'bbeh_word_sorting',
    'bbeh_zebra_puzzles',
]

bbeh_datasets = []
for _name in bbeh_multiple_choice_sets:
    bbeh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))
    bbeh_eval_cfg = dict(
        evaluator=dict(type=BBEHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbeh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbeh_mcq_postprocess))

    bbeh_datasets.append(
        dict(
            type=BBEHDataset,
            path='opencompass/bbeh',
            name=_name,
            abbr=_name,
            reader_cfg=bbeh_reader_cfg,
            infer_cfg=bbeh_infer_cfg.copy(),
            eval_cfg=bbeh_eval_cfg.copy()))

for _name in bbeh_free_form_sets:
    bbeh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))
    bbeh_eval_cfg = dict(evaluator=dict(type=BBEHEvaluator), pred_role='BOT', pred_postprocessor=dict(type=bbeh_mcq_postprocess), dataset_postprocessor=dict(type=bbeh_mcq_postprocess))

    bbeh_datasets.append(
        dict(
            type=BBEHDataset,
            path='opencompass/bbeh',
            name=_name,
            abbr=_name,
            reader_cfg=bbeh_reader_cfg,
            infer_cfg=bbeh_infer_cfg.copy(),
            eval_cfg=bbeh_eval_cfg.copy()))
126 opencompass/configs/datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py Normal file
@@ -0,0 +1,126 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    BBEHDataset,
    generic_llmjudge_postprocess,
)
from opencompass.evaluator import GenericLLMEvaluator

bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')


bbeh_multiple_choice_sets = [
    'bbeh_boolean_expressions',
    'bbeh_disambiguation_qa',
    'bbeh_geometric_shapes',
    'bbeh_hyperbaton',
    'bbeh_movie_recommendation',
    'bbeh_nycc',
    'bbeh_shuffled_objects',
]

bbeh_free_form_sets = [
    'bbeh_boardgame_qa',
    'bbeh_buggy_tables',
    'bbeh_causal_understanding',
    'bbeh_dyck_languages',
    'bbeh_linguini',
    'bbeh_multistep_arithmetic',
    'bbeh_object_counting',
    'bbeh_object_properties',
    'bbeh_sarc_triples',
    'bbeh_spatial_reasoning',
    'bbeh_sportqa',
    'bbeh_temporal_sequence',
    'bbeh_time_arithmetic',
    'bbeh_web_of_lies',
    'bbeh_word_sorting',
    'bbeh_zebra_puzzles',
]

GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

    Here are some evaluation criteria:
    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
    A: CORRECT
    B: INCORRECT
    Just return the letters "A" or "B", with no text around it.

    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

    Judging the correctness of candidates' answers:
""".strip()

bbeh_datasets = []
for _name in bbeh_multiple_choice_sets + bbeh_free_form_sets:
    bbeh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: ",
                    )
                ]
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    bbeh_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                        )
                    ],
                    round=[
                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                    ],
                ),
            ),
            dataset_cfg=dict(
                type=BBEHDataset,
                path='opencompass/bbeh',
                name=_name,
                abbr=_name,
                reader_cfg=bbeh_reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )

    bbeh_datasets.append(
        dict(
            type=BBEHDataset,
            path='opencompass/bbeh',
            name=_name,
            abbr=_name,
            reader_cfg=bbeh_reader_cfg,
            infer_cfg=bbeh_infer_cfg,
            eval_cfg=bbeh_eval_cfg,
        )
    )
189 opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py Normal file
@@ -0,0 +1,189 @@
# flake8: noqa

import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import BBHDataset
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess


bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]


GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

    Here are some evaluation criteria:
    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
    A: CORRECT
    B: INCORRECT
    Just return the letters "A" or "B", with no text around it.

    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

    Judging the correctness of candidates' answers:
""".strip()


bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets

# For zero shot inference in bbh
bbh_datasets = []
for _name in bbh_sets:
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))

    bbh_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=BBHDataset,
                name=_name,
                path='opencompass/bbh',
                reader_cfg=bbh_reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
        ),
        pred_role='BOT',
    )

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy())
    )


# For original 3 shot inference in bbh
bbh_3_shot_datasets = []
for _name in bbh_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))

    bbh_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=BBHDataset,
                name=_name,
                path='opencompass/bbh',
                reader_cfg=bbh_reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
        ),
        pred_role='BOT',
    )

    bbh_3_shot_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .bbh_gen_5b92b0 import bbh_datasets  # noqa: F401, F403
+    from .bbh_gen_ee62e9 import bbh_datasets  # noqa: F401, F403
99 opencompass/configs/datasets/bbh/bbh_gen_ee62e9.py Normal file
@@ -0,0 +1,99 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
4 opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py Normal file
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .bbh_llmjudge_gen_b5bdf1 import bbh_datasets  # noqa: F401, F403
189 opencompass/configs/datasets/bbh/bbh_llmjudge_gen_b5bdf1.py Normal file
@@ -0,0 +1,189 @@
# flake8: noqa

import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import BBHDataset
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess


bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]


GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

    Here are some evaluation criteria:
    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
    A: CORRECT
    B: INCORRECT
    Just return the letters "A" or "B", with no text around it.

    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

    Judging the correctness of candidates' answers:
""".strip()


bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets

# For zero shot inference in bbh
bbh_datasets = []
for _name in bbh_sets:
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))

    bbh_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=BBHDataset,
                name=_name,
                path='opencompass/bbh',
                reader_cfg=bbh_reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
        ),
        pred_role='BOT',
    )

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy())
    )


# For original 3 shot inference in bbh
bbh_3_shot_datasets = []
for _name in bbh_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))

    bbh_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=BBHDataset,
                name=_name,
                path='opencompass/bbh',
                reader_cfg=bbh_reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
        ),
        pred_role='BOT',
    )

    bbh_3_shot_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
@@ -1,53 +1,43 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import (
-    BigCodeBenchDataset,
-    BigCodeBenchEvaluator
-)
+from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)

 bigcodebench_full_reader_cfg = dict(
     input_columns=['complete_prompt'],
     output_column='test',
 )

-bigcodebench_full_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            begin=[dict(role='system',
-                        fallback_role='HUMAN',
-                        prompt='')],
-            round=[
-                dict(role='HUMAN', prompt='{complete_prompt}'),
-            ]
-        )
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=1024)
-)
+bigcodebench_full_infer_cfg = dict(prompt_template=dict(
+    type=PromptTemplate,
+    template=dict(
+        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
+        round=[
+            dict(role='HUMAN', prompt='{complete_prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer,
+                    max_out_len=1024))

 bigcodebench_full_eval_cfg = dict(
     evaluator=dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='complete',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api=
+        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='full',
     ),
     pred_role='BOT',
 )

 bigcodebench_full_complete_datasets = [
-    dict(
-        abbr='bigcodebench_full_complete',
-        type=BigCodeBenchDataset,
-        path='opencompass/bigcodebench',
-        reader_cfg=bigcodebench_full_reader_cfg,
-        infer_cfg=bigcodebench_full_infer_cfg,
-        eval_cfg=bigcodebench_full_eval_cfg,
-        release_version='v0.1.2'
-    )
+    dict(abbr='bigcodebench_full_complete',
+         type=BigCodeBenchDataset,
+         path='opencompass/bigcodebench',
+         reader_cfg=bigcodebench_full_reader_cfg,
+         infer_cfg=bigcodebench_full_infer_cfg,
+         eval_cfg=bigcodebench_full_eval_cfg,
+         release_version='v0.1.2')
 ]
@@ -1,53 +1,43 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import (
-    BigCodeBenchDataset,
-    BigCodeBenchEvaluator
-)
+from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)

 bigcodebench_full_reader_cfg = dict(
     input_columns=['instruct_prompt'],
     output_column='test',
 )

-bigcodebench_full_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            begin=[dict(role='system',
-                        fallback_role='HUMAN',
-                        prompt='')],
-            round=[
-                dict(role='HUMAN', prompt='{instruct_prompt}'),
-            ]
-        )
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=8192)
-)
+bigcodebench_full_infer_cfg = dict(prompt_template=dict(
+    type=PromptTemplate,
+    template=dict(
+        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
+        round=[
+            dict(role='HUMAN', prompt='{instruct_prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer,
+                    max_out_len=8192))

 bigcodebench_full_eval_cfg = dict(
     evaluator=dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='instruct',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api=
+        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='full',
     ),
     pred_role='BOT',
 )

 bigcodebench_full_instruct_datasets = [
-    dict(
-        abbr='bigcodebench_full_instruct',
-        type=BigCodeBenchDataset,
-        path='opencompass/bigcodebench',
-        reader_cfg=bigcodebench_full_reader_cfg,
-        infer_cfg=bigcodebench_full_infer_cfg,
-        eval_cfg=bigcodebench_full_eval_cfg,
-        release_version='v0.1.2'
-    )
+    dict(abbr='bigcodebench_full_instruct',
+         type=BigCodeBenchDataset,
+         path='opencompass/bigcodebench',
+         reader_cfg=bigcodebench_full_reader_cfg,
+         infer_cfg=bigcodebench_full_infer_cfg,
+         eval_cfg=bigcodebench_full_eval_cfg,
+         release_version='v0.1.2')
 ]
@@ -0,0 +1,7 @@
from mmengine.config import read_base

with read_base():
    from .bigcodebench_hard_instruct_gen import bigcodebench_hard_instruct_datasets
    from .bigcodebench_hard_complete_gen import bigcodebench_hard_complete_datasets

bigcodebench_hard_datasets = sum((v for k, v in locals().items() if k.endswith('_ds')), [])
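A minimal sketch of how the BigCodeBench-Hard dataset lists defined above could be pulled into an evaluation config; the dataset and model module paths and the run command are assumptions based on the usual OpenCompass config layout, not part of this diff.

# eval_bigcodebench_hard.py -- illustrative sketch only; the import paths below are assumed
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen import \
        bigcodebench_hard_complete_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen import \
        bigcodebench_hard_instruct_datasets  # noqa: F401, E501
    # any chat model config shipped with OpenCompass could be used here (assumed example)
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501

datasets = bigcodebench_hard_complete_datasets + bigcodebench_hard_instruct_datasets
models = lmdeploy_internlm2_5_7b_chat_model
# typically launched with: python run.py eval_bigcodebench_hard.py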
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)

bigcodebench_hard_reader_cfg = dict(
    input_columns=['complete_prompt'],
    output_column='test',
)

bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
    type=PromptTemplate,
    template=dict(
        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
        round=[
            dict(role='HUMAN', prompt='{complete_prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

bigcodebench_hard_eval_cfg = dict(
    evaluator=dict(
        type=BigCodeBenchEvaluator,
        release_version='v0.1.2',
        eval_type='complete',
        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
        remote_execute_api=
        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
        dataset_version='hard',
    ),
    pred_role='BOT',
)

bigcodebench_hard_complete_datasets = [
    dict(
        abbr='bigcodebench_hard_complete',
        type=BigCodeBenchDataset,
        path='opencompass/bigcodebench',
        reader_cfg=bigcodebench_hard_reader_cfg,
        infer_cfg=bigcodebench_hard_infer_cfg,
        eval_cfg=bigcodebench_hard_eval_cfg,
        release_version='v0.1.2',
        dataset_version='hard',
    )
]
@@ -1,40 +1,32 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import (
-    BigCodeBenchDataset,
-    BigCodeBenchEvaluator
-)
+from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
 
 bigcodebench_hard_reader_cfg = dict(
     input_columns=['complete_prompt'],
     output_column='test',
 )
 
-bigcodebench_hard_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            begin=[dict(role='system',
-                        fallback_role='HUMAN',
-                        prompt='')],
-            round=[
-                dict(role='HUMAN', prompt='{complete_prompt}'),
-            ]
-        )
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=1024)
-)
+bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
+    type=PromptTemplate,
+    template=dict(
+        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
+        round=[
+            dict(role='HUMAN', prompt='{complete_prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer,
+                    max_out_len=1024))
 
 bigcodebench_hard_eval_cfg = dict(
     evaluator=dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='complete',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api=
+        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
         dataset_version='hard',
     ),
     pred_role='BOT',
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .bigcodebench_hard_instruct_gen_8815eb import bigcodebench_hard_instruct_datasets  # noqa: F401, F403
+    from .bigcodebench_hard_instruct_gen_c3d5ad import bigcodebench_hard_instruct_datasets  # noqa: F401, F403
@@ -1,40 +1,32 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import (
-    BigCodeBenchDataset,
-    BigCodeBenchEvaluator
-)
+from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
 
 bigcodebench_hard_reader_cfg = dict(
     input_columns=['instruct_prompt'],
     output_column='test',
 )
 
-bigcodebench_hard_infer_cfg = dict(
-    prompt_template=dict(
-        type=PromptTemplate,
-        template=dict(
-            begin=[dict(role='system',
-                        fallback_role='HUMAN',
-                        prompt='')],
-            round=[
-                dict(role='HUMAN', prompt='{instruct_prompt}'),
-            ]
-        )
-    ),
-    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=8192)
-)
+bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
+    type=PromptTemplate,
+    template=dict(
+        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
+        round=[
+            dict(role='HUMAN', prompt='{instruct_prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer,
+                    max_out_len=8192))
 
 bigcodebench_hard_eval_cfg = dict(
     evaluator=dict(
         type=BigCodeBenchEvaluator,
         release_version='v0.1.2',
         eval_type='instruct',
-        remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api=
+        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
        dataset_version='hard',
     ),
     pred_role='BOT',
@@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)

bigcodebench_hard_reader_cfg = dict(
    input_columns=['instruct_prompt'],
    output_column='test',
)

bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
    type=PromptTemplate,
    template=dict(
        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
        round=[
            dict(role='HUMAN', prompt='{instruct_prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

bigcodebench_hard_eval_cfg = dict(
    evaluator=dict(
        type=BigCodeBenchEvaluator,
        release_version='v0.1.2',
        eval_type='instruct',
        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
        remote_execute_api=
        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
        dataset_version='hard',
    ),
    pred_role='BOT',
)

bigcodebench_hard_instruct_datasets = [
    dict(
        abbr='bigcodebench_hard_instruct',
        type=BigCodeBenchDataset,
        path='opencompass/bigcodebench',
        reader_cfg=bigcodebench_hard_reader_cfg,
        infer_cfg=bigcodebench_hard_infer_cfg,
        eval_cfg=bigcodebench_hard_eval_cfg,
        release_version='v0.1.2',
        dataset_version='hard',
    )
]
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .cmmlu_gen_c13365 import cmmlu_datasets  # noqa: F401, F403
+    from .cmmlu_0shot_cot_gen_305931 import cmmlu_datasets  # noqa: F401, F403
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .cmmlu_llmjudge_gen_e1cd9a import cmmlu_datasets  # noqa: F401, F403
185 opencompass/configs/datasets/cmmlu/cmmlu_llmjudge_gen_e1cd9a.py Normal file
@@ -0,0 +1,185 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

cmmlu_subject_mapping = {
    'agronomy': '农学',
    'anatomy': '解剖学',
    'ancient_chinese': '古汉语',
    'arts': '艺术学',
    'astronomy': '天文学',
    'business_ethics': '商业伦理',
    'chinese_civil_service_exam': '中国公务员考试',
    'chinese_driving_rule': '中国驾驶规则',
    'chinese_food_culture': '中国饮食文化',
    'chinese_foreign_policy': '中国外交政策',
    'chinese_history': '中国历史',
    'chinese_literature': '中国文学',
    'chinese_teacher_qualification': '中国教师资格',
    'clinical_knowledge': '临床知识',
    'college_actuarial_science': '大学精算学',
    'college_education': '大学教育学',
    'college_engineering_hydrology': '大学工程水文学',
    'college_law': '大学法律',
    'college_mathematics': '大学数学',
    'college_medical_statistics': '大学医学统计',
    'college_medicine': '大学医学',
    'computer_science': '计算机科学',
    'computer_security': '计算机安全',
    'conceptual_physics': '概念物理学',
    'construction_project_management': '建设工程管理',
    'economics': '经济学',
    'education': '教育学',
    'electrical_engineering': '电气工程',
    'elementary_chinese': '小学语文',
    'elementary_commonsense': '小学常识',
    'elementary_information_and_technology': '小学信息技术',
    'elementary_mathematics': '初等数学',
    'ethnology': '民族学',
    'food_science': '食品科学',
    'genetics': '遗传学',
    'global_facts': '全球事实',
    'high_school_biology': '高中生物',
    'high_school_chemistry': '高中化学',
    'high_school_geography': '高中地理',
    'high_school_mathematics': '高中数学',
    'high_school_physics': '高中物理学',
    'high_school_politics': '高中政治',
    'human_sexuality': '人类性行为',
    'international_law': '国际法学',
    'journalism': '新闻学',
    'jurisprudence': '法理学',
    'legal_and_moral_basis': '法律与道德基础',
    'logical': '逻辑学',
    'machine_learning': '机器学习',
    'management': '管理学',
    'marketing': '市场营销',
    'marxist_theory': '马克思主义理论',
    'modern_chinese': '现代汉语',
    'nutrition': '营养学',
    'philosophy': '哲学',
    'professional_accounting': '专业会计',
    'professional_law': '专业法学',
    'professional_medicine': '专业医学',
    'professional_psychology': '专业心理学',
    'public_relations': '公共关系',
    'security_study': '安全研究',
    'sociology': '社会学',
    'sports_science': '体育学',
    'traditional_chinese_medicine': '中医中药',
    'virology': '病毒学',
    'world_history': '世界历史',
    'world_religions': '世界宗教',
}

QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.

{question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()


GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()

cmmlu_all_sets = list(cmmlu_subject_mapping.keys())

cmmlu_datasets = []
for _name in cmmlu_all_sets:
    _ch_name = cmmlu_subject_mapping[_name]
    prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
    cmmlu_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=prompt_prefix + QUERY_TEMPLATE),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    cmmlu_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                        )
                    ],
                    round=[
                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                    ],
                ),
            ),
            dataset_cfg=dict(
                type=CMMLUDataset,
                path='opencompass/cmmlu',
                name=_name,
                reader_cfg=dict(
                    input_columns=['question', 'A', 'B', 'C', 'D'],
                    output_column='answer',
                    train_split='dev',
                    test_split='test',
                ),
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )
    cmmlu_datasets.append(
        dict(
            type=CMMLUDataset,
            path='opencompass/cmmlu',
            name=_name,
            abbr=f'cmmlu-{_name}',
            reader_cfg=dict(
                input_columns=['question', 'A', 'B', 'C', 'D'],
                output_column='answer',
                train_split='dev',
                test_split='test',
            ),
            infer_cfg=cmmlu_infer_cfg,
            eval_cfg=cmmlu_eval_cfg,
            mode='singlescore',
        )
    )

del _name, _ch_name
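The `judge_cfg=dict()` entries above are left empty, so the judge model has to be supplied by the evaluation config that uses these datasets. A minimal sketch of what that wiring might look like with an OpenAI-compatible judge endpoint; the model path, key, and endpoint URL are placeholders, and the exact `OpenAISDK` arguments should be checked against the installed OpenCompass version.

# Illustrative sketch only: attach a judge model to the GenericLLMEvaluator configs above.
from opencompass.models import OpenAISDK

judge_cfg = dict(
    type=OpenAISDK,
    path='judge-model-name',                      # placeholder model identifier
    key='YOUR_API_KEY',                           # placeholder credential
    openai_api_base='http://localhost:23333/v1',  # placeholder endpoint
    query_per_second=8,
    max_out_len=1024,
)

for d in cmmlu_datasets:
    # fill the empty judge_cfg defined in the dataset config
    d['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg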
39 opencompass/configs/datasets/cmo_fib/cmo_fib_gen_2783e5.py Normal file
@@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2


cmo_fib_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer'
)


cmo_fib_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\n请一步一步地推理,并将最终答案写入\\boxed{}.'),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

cmo_fib_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)

cmo_fib_datasets = [
    dict(
        abbr='cmo_fib',
        type=CMOFibDataset,
        path='opencompass/cmo_fib',
        reader_cfg=cmo_fib_reader_cfg,
        infer_cfg=cmo_fib_infer_cfg,
        eval_cfg=cmo_fib_eval_cfg
    )
]
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .drop_openai_simple_evals_gen_3857b0 import drop_datasets
+    from .drop_openai_simple_evals_gen_3857b0 import drop_datasets  # noqa: F401, F403
4 opencompass/configs/datasets/drop/drop_llm_judge_gen.py Normal file
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .drop_llmjudge_gen_3857b0 import drop_datasets  # noqa: F401, F403
@@ -0,0 +1,89 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DropOpenAIDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

with read_base():
    from .drop_examples import drop_examples  # noqa: F401, F403

drop_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answers',
    train_split='validation',
    test_split='validation',
)

template = f'You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.\n\n{drop_examples}\n\n# Your Task\n\n---\n{{prompt}}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.'

GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: {prompt}\n \n<Original Question End>\n\n
<Gold Target Begin>: \n{answers}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()

drop_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt=template)]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

drop_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    )
                ],
                round=[
                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                ],
            ),
        ),
        dataset_cfg=dict(
            type=DropOpenAIDataset,
            path='data/drop_simple_eval/dev.jsonl',
            reader_cfg=drop_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
    pred_role='BOT',
)
drop_datasets = [
    dict(
        abbr='drop',
        type=DropOpenAIDataset,
        path='data/drop_simple_eval/dev.jsonl',
        reader_cfg=drop_reader_cfg,
        infer_cfg=drop_infer_cfg,
        eval_cfg=drop_eval_cfg,
    )
]
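The grader prompt above asks the judge to reply with a bare "A" (correct) or "B" (incorrect), and `generic_llmjudge_postprocess` is what turns those verdicts into a score. As a rough illustration of that contract only, and not the actual OpenCompass implementation, accuracy over a list of judge replies could be computed like this:

# Illustrative stand-in, not the real generic_llmjudge_postprocess.
def accuracy_from_verdicts(verdicts):
    """Count 'A' (CORRECT) verdicts; anything else is treated as incorrect."""
    graded = [v.strip().upper().startswith('A') for v in verdicts]
    return 100.0 * sum(graded) / max(len(graded), 1)

print(accuracy_from_verdicts(['A', 'B', 'A', 'A']))  # 75.0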
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
+    from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets  # noqa: F401, F403
4 opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py Normal file
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .gpqa_0shot_nocot_genericllmeval_gen_772ea0 import gpqa_datasets  # noqa: F401, F403
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
from opencompass.datasets import MATHEvaluator, math_postprocess_v2

gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

gsm8k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

gsm8k_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2),
    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
)

gsm8k_datasets = [
    dict(
        abbr='gsm8k',
        type=GSM8KDataset,
        path='opencompass/gsm8k',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg,
    )
]
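The prompt above asks the model to put its final answer inside \boxed{...}, and `math_postprocess_v2` is responsible for pulling that value out before the evaluator compares it to the reference. A simplified stand-in for that extraction step, for illustration only; the real postprocessor handles more edge cases such as nested braces:

import re

# Simplified illustration of extracting the last \boxed{...} answer; this is
# not the actual math_postprocess_v2 implementation.
def extract_boxed_answer(text: str) -> str:
    matches = re.findall(r'\\boxed\{([^{}]*)\}', text)
    return matches[-1].strip() if matches else ''

print(extract_boxed_answer('... so the total is \\boxed{72}.'))  # prints: 72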
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .hellaswag_gen_6faab5 import hellaswag_datasets  # noqa: F401, F403
+    from .hellaswag_10shot_gen_e42710 import hellaswag_datasets  # noqa: F401, F403
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .hellaswag_llmjudge_gen_809ef1 import hellaswag_datasets  # noqa: F401, F403
@@ -0,0 +1,97 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import HellaswagDatasetwithICE
from opencompass.utils.text_postprocessors import first_option_postprocess
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

hellaswag_reader_cfg = dict(
    input_columns=['ctx', 'A', 'B', 'C', 'D'],
    output_column='label',
    train_split='train',
    test_split='val',
)

align_prompt = """Continue the following text without adding any additional information or formatting:
{ctx}
A) {A}
B) {B}
C) {C}
D) {D}
What is the right option?'"""

GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: {ctx}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()

hellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=align_prompt),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

hellaswag_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    )
                ],
                round=[
                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                ],
            ),
        ),
        dataset_cfg=dict(
            type=HellaswagDatasetwithICE,
            path='opencompass/hellaswag_ice',
            reader_cfg=hellaswag_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
)

hellaswag_datasets = [
    dict(
        abbr='hellaswag',
        type=HellaswagDatasetwithICE,
        path='opencompass/hellaswag_ice',
        reader_cfg=hellaswag_reader_cfg,
        infer_cfg=hellaswag_infer_cfg,
        eval_cfg=hellaswag_eval_cfg,
    )
]
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, F403
+    from .humaneval_openai_sample_evals_gen_dcae0e import humaneval_datasets  # noqa: F401, F403
4 opencompass/configs/datasets/korbench/korbench_gen.py Normal file
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .korbench_single_0_shot_gen import korbench_0shot_single_datasets  # noqa: F401, F403
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .korbench_single_0shot_genericllmeval_gen_56cf43 import korbench_0shot_single_datasets  # noqa: F401, F403
@@ -0,0 +1,117 @@
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']


GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


<Original Question Begin>: \n{prompt}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

korbench_0shot_single_datasets = []

for category in categories:
    # Prompt template
    prompt_template = dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='HUMAN',
                    prompt=''
                )
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'  # f-string
                )
            ]
        )
    )

    # Reader configuration
    reader_cfg = dict(
        input_columns=['prompt'],
        output_column='answer',
    )

    # Inference configuration
    infer_cfg = dict(
        prompt_template=prompt_template,
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024),
    )

    # Evaluation configuration
    eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=korbenchDataset,
                path='opencompass/korbench',
                prompt_mode='0_shot',
                category=category,
                reader_cfg=reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )

    # Dataset
    korbench_dataset = dict(
        type=korbenchDataset,
        abbr=f'korbench_{category}',
        path='opencompass/korbench',
        prompt_mode='0_shot',
        category=category,
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
        mode='singlescore',
    )

    korbench_0shot_single_datasets.append(korbench_dataset)
@@ -0,0 +1,115 @@
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']

GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: \n{prompt}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

korbench_0shot_single_datasets = []

for category in categories:
    # Prompt template
    prompt_template = dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='HUMAN',
                    prompt=''
                )
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'  # f-string
                )
            ]
        )
    )

    # Reader configuration
    reader_cfg = dict(
        input_columns=['prompt'],
        output_column='answer',
    )

    # Inference configuration
    infer_cfg = dict(
        prompt_template=prompt_template,
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    # Evaluation configuration
    eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=korbenchDataset,
                path='opencompass/korbench',
                prompt_mode='0_shot',
                category=category,
                reader_cfg=reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )

    # Dataset
    korbench_dataset = dict(
        type=korbenchDataset,
        abbr=f'korbench_{category}',
        path='opencompass/korbench',
        prompt_mode='0_shot',
        category=category,
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
        mode='singlescore',
    )

    korbench_0shot_single_datasets.append(korbench_dataset)
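Both korbench configs above build one dataset entry per category, so an evaluation config can keep only the categories it needs. A small sketch of that filtering, which is ordinary Python over the list defined above; the choice of categories is just an example.

# Illustrative only: keep just the cipher and logic splits from the list above.
wanted = {'korbench_cipher', 'korbench_logic'}
korbench_selected_datasets = [
    d for d in korbench_0shot_single_datasets if d['abbr'] in wanted
]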
Some files were not shown because too many files have changed in this diff.