* 'main' of https://github.com/domonic18/opencompass:
  [Refactor] Refactorize openicl eval task (#1990)
  [ci] update baseline for kernel change of vllm and lmdeploy (#2011)
  [Feature] Make dump-eval-details default behavior (#1999)
  [Fix] OpenICL Math Evaluator Config (#2007)
  [Feature] Add CascadeEvaluator (#1992)
  [Dataset] Add MedXpertQA (#2002)
  [Dataset] Update dingo 1.5.0 (#2008)
  [CI] fix baseline score (#2000)
  [Doc] Fix links between zh & en (#2001)
Deadwalk 2025-04-10 11:11:35 +08:00
commit 72b7caa575
59 changed files with 2285 additions and 1426 deletions

View File

@ -24,9 +24,9 @@ models = [
abbr='lmdeploy-api-test',
type=OpenAISDK,
key='EMPTY',
openai_api_base='http://0.0.0.0:23333/v1',
path='internlm2',
tokenizer_path='internlm/internlm2_5-7b-chat',
openai_api_base='http://localhost:23333/v1',
path='internlm3',
tokenizer_path='internlm/internlm3-8b-instruct',
rpm_verbose=True,
meta_template=api_meta_template,
query_per_second=128,

View File

@ -11,18 +11,10 @@ with read_base():
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.chatglm.hf_glm4_9b import \
models as hf_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
models as lmdeploy_glm4_9b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
models as hf_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
models as hf_deepseek_67b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
@ -49,12 +41,6 @@ with read_base():
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
models as hf_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
models as hf_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
@ -65,14 +51,14 @@ with read_base():
models as lmdeploy_internlm2_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
models as lmdeploy_internlm2_base_20b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
models as hf_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_70b import \
models as hf_llama3_70b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \

View File

@ -15,14 +15,24 @@ with read_base():
models as vllm_glm4_9b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
models as hf_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
models as lmdeploy_deepseek_67b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
models as \
lmdeploy_deepseek_r1_distill_llama_8b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
models as \
lmdeploy_deepseek_r1_distill_llama_70b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_1_5b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
models as \
lmdeploy_deepseek_r1_distill_qwen_32b_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
models as lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501
from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
models as lmdeploy_deepseek_v2_lite_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
@ -45,6 +55,8 @@ with read_base():
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
models as hf_internlm2_5_20b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
models as hf_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
@ -57,6 +69,8 @@ with read_base():
models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
models as lmdeploy_internlm3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
@ -83,10 +97,6 @@ with read_base():
models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
models as hf_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
models as \
lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501
@ -95,14 +105,19 @@ with read_base():
from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
models as \
lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501
from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
models as \
lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_4 import \
models as hf_phi_4_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
@ -142,6 +157,8 @@ with read_base():
from ...volc import infer as volc_infer # noqa: F401, E501
hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

View File

@ -175,10 +175,11 @@ class TestApibench:
class TestVolcFullbench:
"""Test cases for chat model."""
@pytest.mark.parametrize(
'model, dataset',
[(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
@pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
] for p2 in dataset_list(p1, 'objective')])
@pytest.mark.chat_objective
def test_chat_objective(self, baseline_scores_fullbench, result_scores,
model, dataset):
@ -245,10 +246,7 @@ class TestCmdCase:
@pytest.mark.parametrize('model, dataset',
[('internlm2_5-7b-hf', 'race-middle_accuracy'),
('internlm2_5-7b-hf', 'race-high_accuracy'),
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
('internlm2-1.8b-hf', 'race-middle_accuracy'),
('internlm2-1.8b-hf', 'race-high_accuracy'),
('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@ -260,9 +258,9 @@ class TestCmdCase:
[('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
@ -280,13 +278,25 @@ class TestCmdCase:
@pytest.mark.case4
@pytest.mark.parametrize(
'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
'model, dataset',
[('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model, result_score, base_score, dataset)
assert_score(model + '_batch', result_score, base_score, dataset)
@pytest.mark.case5
@pytest.mark.parametrize(
'model, dataset',
[('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(model + '_batch', result_score, base_score, dataset)
def assert_score(model_type, score, baseline, dataset: str = ''):

View File

@ -8,20 +8,25 @@ internlm2_5-7b_hf:
race-middle_accuracy: 91.78
race-high_accuracy: 90.02
internlm2-1.8b-hf:
demo_gsm8k_accuracy: 15.62
race-middle_accuracy: 71.66
race-high_accuracy: 66.38
internlm2_5-7b-chat-lmdeploy:
demo_gsm8k_accuracy: 89.06
demo_gsm8k_accuracy: 87.50
race-middle_accuracy: 92.76
race-high_accuracy: 90.54
internlm2-chat-1.8b-lmdeploy:
demo_gsm8k_accuracy: 31
race-middle_accuracy: 81.34
race-high_accuracy: 73.96
internlm3-8b-instruct-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34
internlm3-8b-instruct_hf-lmdeploy:
demo_gsm8k_accuracy: 73.44
race-middle_accuracy: 93.38
race-high_accuracy: 90.34
internlm3-8b-instruct_hf-vllm:
demo_gsm8k_accuracy: 81.25
race-middle_accuracy: 92.20
race-high_accuracy: 89.88
internlm2_5-7b-chat_hf:
demo_gsm8k_accuracy: 87.50
@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf:
race-high_accuracy: 90.48
lmdeploy-api-test:
gsm8k_accuracy: 68.75
race-middle_accuracy: 87.50
gsm8k_accuracy: 56.25
race-middle_accuracy: 93.75
race-high_accuracy: 93.75

View File

@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench:
drop_accuracy: 81.25
GPQA_diamond_accuracy: 25
hellaswag_accuracy: 87.5
TheoremQA_score: 18.75
TheoremQA_score: 12.50
musr_average_naive_average: 39.58
korbench_single_naive_average: 40
gsm8k_accuracy: 62.50
@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
college_knowledge_naive_average: 87.5
subjective:
alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 20
alpaca_eval_total: 0
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 43
mtbench101_avg: 7.8
wildbench_average: -12.78
wildbench_average: -15.56
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 7.90
alignment_bench_v1_1_专业能力: 8.00
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
@ -55,7 +55,7 @@ internlm2_5-7b-chat-hf_fullbench:
alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 20
alpaca_eval_helpful_base: 0
compassarena_language_naive_average: 35
compassarena_knowledge_naive_average: 55
compassarena_reason_v2_naive_average: 40
@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench:
internlm2_5-7b-chat-turbomind_fullbench:
objective:
race-high_accuracy: 93.75
ARC-c_accuracy: 93.75
ARC-c_accuracy: 87.50
BoolQ_accuracy: 68.75
triviaqa_wiki_1shot_score: 50
nq_open_1shot_score: 25
IFEval_Prompt-level-strict-accuracy: 56.25
drop_accuracy: 81.25
drop_accuracy: 75
GPQA_diamond_accuracy: 31.25
hellaswag_accuracy: 81.25
TheoremQA_score: 6.25
hellaswag_accuracy: 87.5
TheoremQA_score: 12.5
musr_average_naive_average: 39.58
korbench_single_naive_average: 37.50
gsm8k_accuracy: 68.75
math_accuracy: 68.75
korbench_single_naive_average: 40
gsm8k_accuracy: 62.5
math_accuracy: 75
cmo_fib_accuracy: 6.25
aime2024_accuracy: 6.25
wikibench-wiki-single_choice_cncircular_perf_4: 50.00
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 68.75
ds1000_naive_average: 16.96
ds1000_naive_average: 17.86
lcb_code_generation_pass@1: 12.5
lcb_code_execution_pass@1: 43.75
lcb_test_output_pass@1: 25.00
bbh-logical_deduction_seven_objects_score: 50.00
bbh-multistep_arithmetic_two_score: 68.75
mmlu-other_accuracy: 69.71
cmmlu-china-specific_accuracy: 75.83
lcb_test_output_pass@1: 18.75
bbh-logical_deduction_seven_objects_score: 56.25
bbh-multistep_arithmetic_two_score: 75
mmlu-other_accuracy: 72.6
cmmlu-china-specific_accuracy: 78.33
mmlu_pro_math_accuracy: 31.25
ds1000_Pandas_accuracy: 0
ds1000_Pandas_accuracy: 12.5
ds1000_Numpy_accuracy: 0
ds1000_Tensorflow_accuracy: 12.5
ds1000_Scipy_accuracy: 18.75
ds1000_Scipy_accuracy: 25
ds1000_Sklearn_accuracy: 18.75
ds1000_Pytorch_accuracy: 18.75
ds1000_Pytorch_accuracy: 6.25
ds1000_Matplotlib_accuracy: 50.00
openai_mmmlu_lite_AR-XY_accuracy: 37.5
college_naive_average: 12.50
college_knowledge_naive_average: 87.5
subjective:
alignment_bench_v1_1_总分: 0.70
alignment_bench_v1_1_总分: 0.66
alpaca_eval_total: 0
arenahard_score: 50
Followbench_naive_average: 1
CompassArena_naive_average: 38
mtbench101_avg: 7.80
wildbench_average: -4.86
CompassArena_naive_average: 40
mtbench101_avg: 8
wildbench_average: -6.81
simpleqa_accuracy_given_attempted: 0
chinese_simpleqa_given_attempted_accuracy: 1
alignment_bench_v1_1_专业能力: 8.4
alignment_bench_v1_1_专业能力: 7.9
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench:
alignment_bench_v1_1_综合问答: 0
alpaca_eval_helpful_base: 0
compassarena_language_naive_average: 35
compassarena_knowledge_naive_average: 50
compassarena_reason_v2_naive_average: 30
compassarena_math_v2_naive_average: 50
compassarena_creationv2_zh_naive_average: 25
compassarena_knowledge_naive_average: 45
compassarena_reason_v2_naive_average: 25
compassarena_math_v2_naive_average: 60
compassarena_creationv2_zh_naive_average: 35
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 25
TheoremQA_score: 12.50
winogrande_accuracy: 75
gsm8k_accuracy: 37.5
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench:
drop_accuracy: 62.5
GPQA_diamond_accuracy: 62.5
hellaswag_accuracy: 93.75
TheoremQA_score: 25.00
TheoremQA_score: 12.50
winogrande_accuracy: 87.5
gsm8k_accuracy: 62.50
GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
gsm8k_accuracy: 56.25
GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
math_accuracy: 18.75
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 62.50
dingo_en_192_score: 31.25
dingo_en_192_score: 50.00
dingo_zh_170_score: 93.75
mmlu-other_accuracy: 76.92
cmmlu-china-specific_accuracy: 84.17
mmlu_pro_math_accuracy: 18.75
bbh-logical_deduction_seven_objects_score: 50
bbh-logical_deduction_seven_objects_score: 43.75
bbh-multistep_arithmetic_two_score: 56.25
college_naive_average: 12.5
college_knowledge_naive_average: 87.5
@ -391,7 +391,7 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_total: 25.96
arenahard_score: 17.15
Followbench_naive_average: 0.81
CompassArena_naive_average: 34.61
CompassArena_naive_average: 39.49
FoFo_naive_average: 0.38
mtbench101_avg: 8.01
wildbench_average: -10.49
@ -409,11 +409,11 @@ internlm2_5-7b-chat-turbomind:
alpaca_eval_koala: 28.21
alpaca_eval_oasst: 23.4
alpaca_eval_selfinstruct: 30.95
alpaca_eval_vicuna: 25
compassarena_language_naive_average: 52.5
alpaca_eval_vicuna: 33.75
compassarena_language_naive_average: 58.50
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_math_v2_naive_average: 25.95
compassarena_creationv2_zh_naive_average: 43.64
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
@ -454,3 +454,530 @@ internlm2_5-7b-chat-1m-turbomind:
longbench_few-shot-learning_score: 51.67
longbench_synthetic-tasks_score: 66.83
longbench_code-completion_score: 45.99
qwen2.5-7b-instruct-turbomind:
objective:
race-high_accuracy: 84.99
ARC-c_accuracy: 92.2
BoolQ_accuracy: 86.7
triviaqa_wiki_1shot_score: 53.06
nq_open_1shot_score: 17.51
mmmlu_lite_naive_average: 54.96
IFEval_Prompt-level-strict-accuracy: 71.53
drop_accuracy: 80.07
bbh_naive_average: 68.81
GPQA_diamond_accuracy: 34.34
hellaswag_accuracy: 85.42
TheoremQA_score: 18.38
musr_average_naive_average: 43.44
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 92.57
GaokaoBench_weighted_average: 80.14
math_accuracy: 73.58
cmo_fib_accuracy: 25
aime2024_accuracy: 16.67
Mathbench_naive_average: 77.33
wikibench-wiki-single_choice_cncircular_perf_4: 34.9
cmmlu_naive_average: 75.97
mmlu_naive_average: 76.01
mmlu_pro_naive_average: 56.12
openai_humaneval_humaneval_pass@1: 83.54
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.29
ds1000_naive_average: 18.66
lcb_code_generation_pass@1: 39.5
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.68
bigcodebench_hard_instruct_pass@1: 16.22
bigcodebench_hard_complete_pass@1: 11.49
teval_naive_average: 79.72
SciCode_sub_accuracy: 10.76
qa_dingo_cn_score: 99.01
mmlu_accuracy: 76.01
mmlu-stem_accuracy: 77.59
mmlu-social-science_accuracy: 79.02
mmlu-humanities_accuracy: 72.07
mmlu-other_accuracy: 74.86
cmmlu_accuracy: 75.97
cmmlu-stem_accuracy: 73.09
cmmlu-social-science_accuracy: 75.95
cmmlu-humanities_accuracy: 76.53
cmmlu-other_accuracy: 78.79
cmmlu-china-specific_accuracy: 73.17
mmlu_pro_accuracy: 56.12
mmlu_pro_biology_accuracy: 71.41
mmlu_pro_business_accuracy: 67.68
mmlu_pro_chemistry_accuracy: 54.59
mmlu_pro_computer_science_accuracy: 58.29
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 42.41
mmlu_pro_health_accuracy: 55.87
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 28.97
mmlu_pro_math_accuracy: 73.13
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 58.43
mmlu_pro_psychology_accuracy: 63.16
mmlu_pro_other_accuracy: 53.57
humanevalx-python_pass@1: 50
humanevalx-cpp_pass@1: 42.07
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 75
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.18
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 10.43
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 60.65
mmmlu_lite_accuracy: 54.96
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.25
openai_mmmlu_lite_DE-DE_accuracy: 59.93
openai_mmmlu_lite_ES-LA_accuracy: 66.53
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 49.26
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.47
openai_mmmlu_lite_JA-JP_accuracy: 61.54
openai_mmmlu_lite_KO-KR_accuracy: 60.28
openai_mmmlu_lite_PT-BR_accuracy: 55.51
openai_mmmlu_lite_SW-KE_accuracy: 36.42
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 69.61
college_naive_average: 48
high_naive_average: 59
middle_naive_average: 78
primary_naive_average: 85.67
arithmetic_naive_average: 75.67
mathbench-a (average)_naive_average: 69.27
college_knowledge_naive_average: 83.86
high_knowledge_naive_average: 80.29
middle_knowledge_naive_average: 84.26
primary_knowledge_naive_average: 93.16
mathbench-t (average)_naive_average: 85.39
internlm2_5-7b-chat-pytorch:
objective:
race-high_accuracy: 86.39
ARC-c_accuracy: 90.51
BoolQ_accuracy: 88.01
triviaqa_wiki_1shot_score: 64.77
nq_open_1shot_score: 22.71
mmmlu_lite_naive_average: 45.02
IFEval_Prompt-level-strict-accuracy: 56.56
drop_accuracy: 75.46
bbh_naive_average: 73.34
GPQA_diamond_accuracy: 32.83
hellaswag_accuracy: 94.81
TheoremQA_score: 23.88
musr_average_naive_average: 51.31
korbench_single_naive_average: 32
ARC_Prize_Public_Evaluation_accuracy: 0.01
gsm8k_accuracy: 86.96
GaokaoBench_weighted_average: 78.05
math_accuracy: 60.34
cmo_fib_accuracy: 12.98
aime2024_accuracy: 3.33
Mathbench_naive_average: 64.82
wikibench-wiki-single_choice_cncircular_perf_4: 31.7
cmmlu_naive_average: 74.24
mmlu_naive_average: 70.2
mmlu_pro_naive_average: 45.39
openai_humaneval_humaneval_pass@1: 70.12
sanitized_mbpp_score: 64.59
humanevalx_naive_average: 38.78
ds1000_naive_average: 14.19
lcb_code_generation_pass@1: 16.5
lcb_code_execution_pass@1: 33.82
lcb_test_output_pass@1: 22.62
bigcodebench_hard_instruct_pass@1: 6.08
bigcodebench_hard_complete_pass@1: 6.76
teval_naive_average: 79.73
SciCode_sub_accuracy: 3.47
qa_dingo_cn_score: 100
mmlu_accuracy: 70.2
mmlu-stem_accuracy: 67.73
mmlu-social-science_accuracy: 75.49
mmlu-humanities_accuracy: 68.56
mmlu-other_accuracy: 70.58
cmmlu_accuracy: 74.24
cmmlu-stem_accuracy: 66.7
cmmlu-social-science_accuracy: 75.88
cmmlu-humanities_accuracy: 77.56
cmmlu-other_accuracy: 77.52
cmmlu-china-specific_accuracy: 73.46
mmlu_pro_accuracy: 45.39
mmlu_pro_biology_accuracy: 65.83
mmlu_pro_business_accuracy: 51.96
mmlu_pro_chemistry_accuracy: 36.84
mmlu_pro_computer_science_accuracy: 48.29
mmlu_pro_economics_accuracy: 56.16
mmlu_pro_engineering_accuracy: 29.1
mmlu_pro_health_accuracy: 44.5
mmlu_pro_history_accuracy: 42.26
mmlu_pro_law_accuracy: 24.98
mmlu_pro_math_accuracy: 54.85
mmlu_pro_philosophy_accuracy: 39.28
mmlu_pro_physics_accuracy: 37.41
mmlu_pro_psychology_accuracy: 58.27
mmlu_pro_other_accuracy: 45.78
humanevalx-python_pass@1: 56.1
humanevalx-cpp_pass@1: 20.73
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 59.15
humanevalx-js_pass@1: 57.93
ds1000_Pandas_accuracy: 8.93
ds1000_Numpy_accuracy: 4.09
ds1000_Tensorflow_accuracy: 11.11
ds1000_Scipy_accuracy: 7.55
ds1000_Sklearn_accuracy: 7.83
ds1000_Pytorch_accuracy: 8.82
ds1000_Matplotlib_accuracy: 50.97
mmmlu_lite_accuracy: 45.02
openai_mmmlu_lite_AR-XY_accuracy: 18.6
openai_mmmlu_lite_BN-BD_accuracy: 27.58
openai_mmmlu_lite_DE-DE_accuracy: 51.23
openai_mmmlu_lite_ES-LA_accuracy: 56.63
openai_mmmlu_lite_FR-FR_accuracy: 58.11
openai_mmmlu_lite_HI-IN_accuracy: 33.82
openai_mmmlu_lite_ID-ID_accuracy: 50.39
openai_mmmlu_lite_IT-IT_accuracy: 50.39
openai_mmmlu_lite_JA-JP_accuracy: 50.95
openai_mmmlu_lite_KO-KR_accuracy: 45.05
openai_mmmlu_lite_PT-BR_accuracy: 57.89
openai_mmmlu_lite_SW-KE_accuracy: 32.14
openai_mmmlu_lite_YO-NG_accuracy: 32.14
openai_mmmlu_lite_ZH-CN_accuracy: 65.33
college_naive_average: 21
high_naive_average: 47
middle_naive_average: 59.67
primary_naive_average: 76
arithmetic_naive_average: 62
mathbench-a (average)_naive_average: 53.13
college_knowledge_naive_average: 68.99
high_knowledge_naive_average: 70.06
middle_knowledge_naive_average: 78.53
primary_knowledge_naive_average: 88.49
mathbench-t (average)_naive_average: 76.51
qwen2.5-7b-instruct-pytorch:
objective:
race-high_accuracy: 85.16
ARC-c_accuracy: 90.85
BoolQ_accuracy: 86.61
triviaqa_wiki_1shot_score: 52.96
nq_open_1shot_score: 17.62
mmmlu_lite_naive_average: 54.7
IFEval_Prompt-level-strict-accuracy: 71.35
drop_accuracy: 80.23
bbh_naive_average: 68.88
GPQA_diamond_accuracy: 36.36
hellaswag_accuracy: 85.49
TheoremQA_score: 18.38
musr_average_naive_average: 43.3
korbench_single_naive_average: 39.44
ARC_Prize_Public_Evaluation_accuracy: 0
gsm8k_accuracy: 91.66
GaokaoBench_weighted_average: 80.02
math_accuracy: 73.74
cmo_fib_accuracy: 26.44
aime2024_accuracy: 13.33
Mathbench_naive_average: 77.08
wikibench-wiki-single_choice_cncircular_perf_4: 34
cmmlu_naive_average: 75.9
mmlu_naive_average: 76.27
mmlu_pro_naive_average: 56.14
openai_humaneval_humaneval_pass@1: 84.76
sanitized_mbpp_score: 74.71
humanevalx_naive_average: 48.17
ds1000_naive_average: 18.57
lcb_code_generation_pass@1: 38.75
lcb_code_execution_pass@1: 42.38
lcb_test_output_pass@1: 50.45
bigcodebench_hard_instruct_pass@1: 16.89
bigcodebench_hard_complete_pass@1: 12.16
teval_naive_average: 79.46
SciCode_sub_accuracy: 10.42
qa_dingo_cn_score: 100
mmlu_accuracy: 76.27
mmlu-stem_accuracy: 77.75
mmlu-social-science_accuracy: 78.65
mmlu-humanities_accuracy: 73.12
mmlu-other_accuracy: 75.05
cmmlu_accuracy: 75.9
cmmlu-stem_accuracy: 73.41
cmmlu-social-science_accuracy: 75.97
cmmlu-humanities_accuracy: 76.42
cmmlu-other_accuracy: 78.15
cmmlu-china-specific_accuracy: 73.27
mmlu_pro_accuracy: 56.14
mmlu_pro_biology_accuracy: 72.25
mmlu_pro_business_accuracy: 66.16
mmlu_pro_chemistry_accuracy: 55.65
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 66.82
mmlu_pro_engineering_accuracy: 41.38
mmlu_pro_health_accuracy: 54.89
mmlu_pro_history_accuracy: 46.46
mmlu_pro_law_accuracy: 29.06
mmlu_pro_math_accuracy: 73.58
mmlu_pro_philosophy_accuracy: 44.89
mmlu_pro_physics_accuracy: 60.05
mmlu_pro_psychology_accuracy: 61.9
mmlu_pro_other_accuracy: 52.6
humanevalx-python_pass@1: 51.83
humanevalx-cpp_pass@1: 42.68
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 73.78
humanevalx-js_pass@1: 72.56
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 8.64
ds1000_Tensorflow_accuracy: 17.78
ds1000_Scipy_accuracy: 15.09
ds1000_Sklearn_accuracy: 8.7
ds1000_Pytorch_accuracy: 4.41
ds1000_Matplotlib_accuracy: 61.29
mmmlu_lite_accuracy: 54.7
openai_mmmlu_lite_AR-XY_accuracy: 42.32
openai_mmmlu_lite_BN-BD_accuracy: 42.18
openai_mmmlu_lite_DE-DE_accuracy: 60
openai_mmmlu_lite_ES-LA_accuracy: 66.18
openai_mmmlu_lite_FR-FR_accuracy: 66.88
openai_mmmlu_lite_HI-IN_accuracy: 48.63
openai_mmmlu_lite_ID-ID_accuracy: 61.26
openai_mmmlu_lite_IT-IT_accuracy: 65.26
openai_mmmlu_lite_JA-JP_accuracy: 60.7
openai_mmmlu_lite_KO-KR_accuracy: 60.63
openai_mmmlu_lite_PT-BR_accuracy: 54.46
openai_mmmlu_lite_SW-KE_accuracy: 36
openai_mmmlu_lite_YO-NG_accuracy: 31.86
openai_mmmlu_lite_ZH-CN_accuracy: 69.4
college_naive_average: 48.33
high_naive_average: 59.33
middle_naive_average: 76.67
primary_naive_average: 86.67
arithmetic_naive_average: 74.33
mathbench-a (average)_naive_average: 69.07
college_knowledge_naive_average: 83.54
high_knowledge_naive_average: 80.82
middle_knowledge_naive_average: 83.79
primary_knowledge_naive_average: 92.22
mathbench-t (average)_naive_average: 85.1
internlm3-8b-instruct-turbomind:
objective:
race-high_accuracy: 89.22
ARC-c_accuracy: 92.54
BoolQ_accuracy: 86.45
triviaqa_wiki_1shot_score: 60.72
nq_open_1shot_score: 20.25
mmmlu_lite_naive_average: 41.82
IFEval_Prompt-level-strict-accuracy: 77.45
drop_accuracy: 83.27
bbh_naive_average: 55.22
GPQA_diamond_accuracy: 37.88
hellaswag_accuracy: 91.28
TheoremQA_score: 20.12
musr_average_naive_average: 36.86
korbench_single_naive_average: 41.2
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 91.28
GaokaoBench_weighted_average: 86.59
math_accuracy: 76.96
cmo_fib_accuracy: 35.1
aime2024_accuracy: 16.67
Mathbench_naive_average: 78.96
wikibench-wiki-single_choice_cncircular_perf_4: 37.45
cmmlu_naive_average: 83.33
mmlu_naive_average: 76.21
mmlu_pro_naive_average: 57.96
openai_humaneval_humaneval_pass@1: 81.71
sanitized_mbpp_score: 69.65
humanevalx_naive_average: 40.73
ds1000_naive_average: 27.23
lcb_code_generation_pass@1: 34.75
lcb_code_execution_pass@1: 49.9
lcb_test_output_pass@1: 48.19
bigcodebench_hard_instruct_pass@1: 13.51
bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86
SciCode_sub_accuracy: 11.11
qa_dingo_cn_score: 100
mmlu_accuracy: 76.21
mmlu-stem_accuracy: 77.7
mmlu-social-science_accuracy: 80.98
mmlu-humanities_accuracy: 70.83
mmlu-other_accuracy: 75.01
cmmlu_accuracy: 83.33
cmmlu-stem_accuracy: 79.66
cmmlu-social-science_accuracy: 83.39
cmmlu-humanities_accuracy: 84.73
cmmlu-other_accuracy: 86.2
cmmlu-china-specific_accuracy: 81.77
mmlu_pro_accuracy: 57.96
mmlu_pro_biology_accuracy: 75.45
mmlu_pro_business_accuracy: 64.64
mmlu_pro_chemistry_accuracy: 59.81
mmlu_pro_computer_science_accuracy: 60.24
mmlu_pro_economics_accuracy: 68.6
mmlu_pro_engineering_accuracy: 44.79
mmlu_pro_health_accuracy: 58.31
mmlu_pro_history_accuracy: 49.87
mmlu_pro_law_accuracy: 32.43
mmlu_pro_math_accuracy: 70.17
mmlu_pro_philosophy_accuracy: 46.89
mmlu_pro_physics_accuracy: 59.58
mmlu_pro_psychology_accuracy: 66.29
mmlu_pro_other_accuracy: 54.33
humanevalx-python_pass@1: 43.9
humanevalx-cpp_pass@1: 20.12
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 74.39
humanevalx-js_pass@1: 65.24
ds1000_Pandas_accuracy: 16.49
ds1000_Numpy_accuracy: 34.09
ds1000_Tensorflow_accuracy: 26.67
ds1000_Scipy_accuracy: 17.92
ds1000_Sklearn_accuracy: 20.87
ds1000_Pytorch_accuracy: 19.12
ds1000_Matplotlib_accuracy: 55.48
mmmlu_lite_accuracy: 41.82
openai_mmmlu_lite_AR-XY_accuracy: 32.56
openai_mmmlu_lite_BN-BD_accuracy: 4.56
openai_mmmlu_lite_DE-DE_accuracy: 24.91
openai_mmmlu_lite_ES-LA_accuracy: 51.09
openai_mmmlu_lite_FR-FR_accuracy: 61.68
openai_mmmlu_lite_HI-IN_accuracy: 24.98
openai_mmmlu_lite_ID-ID_accuracy: 44.56
openai_mmmlu_lite_IT-IT_accuracy: 52.35
openai_mmmlu_lite_JA-JP_accuracy: 51.02
openai_mmmlu_lite_KO-KR_accuracy: 47.93
openai_mmmlu_lite_PT-BR_accuracy: 53.89
openai_mmmlu_lite_SW-KE_accuracy: 33.47
openai_mmmlu_lite_YO-NG_accuracy: 33.47
openai_mmmlu_lite_ZH-CN_accuracy: 69.05
college_naive_average: 45.67
high_naive_average: 64.67
middle_naive_average: 82.33
primary_naive_average: 90.33
arithmetic_naive_average: 74
mathbench-a (average)_naive_average: 71.4
college_knowledge_naive_average: 85.28
high_knowledge_naive_average: 79.43
middle_knowledge_naive_average: 87.9
primary_knowledge_naive_average: 93.42
mathbench-t (average)_naive_average: 86.51
internlm3-8b-instruct-pytorch:
objective:
race-high_accuracy: 89.02
ARC-c_accuracy: 93.56
BoolQ_accuracy: 86.67
triviaqa_wiki_1shot_score: 60.54
nq_open_1shot_score: 20.3
mmmlu_lite_naive_average: 42.6
IFEval_Prompt-level-strict-accuracy: 79.11
drop_accuracy: 83.32
bbh_naive_average: 54.76
GPQA_diamond_accuracy: 33.84
hellaswag_accuracy: 91.31
TheoremQA_score: 18
musr_average_naive_average: 36.62
korbench_single_naive_average: 41.84
ARC_Prize_Public_Evaluation_accuracy: 0.06
gsm8k_accuracy: 90.67
GaokaoBench_weighted_average: 86.27
math_accuracy: 76.68
cmo_fib_accuracy: 33.65
aime2024_accuracy: 10
Mathbench_naive_average: 78.92
wikibench-wiki-single_choice_cncircular_perf_4: 37.35
cmmlu_naive_average: 83.11
mmlu_naive_average: 76.23
mmlu_pro_naive_average: 58.16
openai_humaneval_humaneval_pass@1: 82.32
sanitized_mbpp_score: 70.04
humanevalx_naive_average: 39.76
ds1000_naive_average: 27.84
lcb_code_generation_pass@1: 34.5
lcb_code_execution_pass@1: 48.02
lcb_test_output_pass@1: 47.74
bigcodebench_hard_instruct_pass@1: 12.84
bigcodebench_hard_complete_pass@1: 15.54
teval_naive_average: 82.86
SciCode_sub_accuracy: 9.38
qa_dingo_cn_score: 100
mmlu_accuracy: 76.23
mmlu-stem_accuracy: 78.08
mmlu-social-science_accuracy: 80.31
mmlu-humanities_accuracy: 71.38
mmlu-other_accuracy: 74.63
cmmlu_accuracy: 83.11
cmmlu-stem_accuracy: 79.42
cmmlu-social-science_accuracy: 83.34
cmmlu-humanities_accuracy: 83.95
cmmlu-other_accuracy: 86.22
cmmlu-china-specific_accuracy: 81.5
mmlu_pro_accuracy: 58.16
mmlu_pro_biology_accuracy: 74.62
mmlu_pro_business_accuracy: 65.02
mmlu_pro_chemistry_accuracy: 60.69
mmlu_pro_computer_science_accuracy: 61.46
mmlu_pro_economics_accuracy: 68.25
mmlu_pro_engineering_accuracy: 45.3
mmlu_pro_health_accuracy: 60.15
mmlu_pro_history_accuracy: 50.66
mmlu_pro_law_accuracy: 31.7
mmlu_pro_math_accuracy: 70.32
mmlu_pro_philosophy_accuracy: 47.7
mmlu_pro_physics_accuracy: 59.51
mmlu_pro_psychology_accuracy: 65.41
mmlu_pro_other_accuracy: 53.46
humanevalx-python_pass@1: 42.68
humanevalx-cpp_pass@1: 19.51
humanevalx-go_pass@1: 0
humanevalx-java_pass@1: 72.56
humanevalx-js_pass@1: 64.02
ds1000_Pandas_accuracy: 14.09
ds1000_Numpy_accuracy: 35
ds1000_Tensorflow_accuracy: 24.44
ds1000_Scipy_accuracy: 20.75
ds1000_Sklearn_accuracy: 21.74
ds1000_Pytorch_accuracy: 22.06
ds1000_Matplotlib_accuracy: 56.77
mmmlu_lite_accuracy: 42.6
openai_mmmlu_lite_AR-XY_accuracy: 32.84
openai_mmmlu_lite_BN-BD_accuracy: 10.46
openai_mmmlu_lite_DE-DE_accuracy: 24.56
openai_mmmlu_lite_ES-LA_accuracy: 50.95
openai_mmmlu_lite_FR-FR_accuracy: 61.05
openai_mmmlu_lite_HI-IN_accuracy: 30.6
openai_mmmlu_lite_ID-ID_accuracy: 45.89
openai_mmmlu_lite_IT-IT_accuracy: 51.79
openai_mmmlu_lite_JA-JP_accuracy: 51.65
openai_mmmlu_lite_KO-KR_accuracy: 48.77
openai_mmmlu_lite_PT-BR_accuracy: 52.7
openai_mmmlu_lite_SW-KE_accuracy: 32.91
openai_mmmlu_lite_YO-NG_accuracy: 32.84
openai_mmmlu_lite_ZH-CN_accuracy: 69.33
college_naive_average: 47
high_naive_average: 66.67
middle_naive_average: 81.67
primary_naive_average: 89.33
arithmetic_naive_average: 73.67
mathbench-a (average)_naive_average: 71.67
college_knowledge_naive_average: 82.91
high_knowledge_naive_average: 79.86
middle_knowledge_naive_average: 88.92
primary_knowledge_naive_average: 92.96
mathbench-t (average)_naive_average: 86.16

View File

@ -1,7 +1,7 @@
chat:
glm-4-9b-chat-hf:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
glm-4-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 90.62
@ -11,11 +11,14 @@ chat:
deepseek-7b-chat-hf:
gsm8k_accuracy: 46.88
race-high_accuracy: 81.25
deepseek-moe-16b-chat-hf:
gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-r1-distill-llama-8b-turbomind:
gsm8k_accuracy: 31.25
race-high_accuracy: 81.25
deepseek-r1-distill-qwen-1_5b-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 53.12
deepseek-7b-chat-vllm:
gsm8k_accuracy: 50
gsm8k_accuracy: 43.75
race-high_accuracy: 78.12
gemma2-2b-it-hf:
gsm8k_accuracy: 50
@ -36,34 +39,40 @@ chat:
gsm8k_accuracy: 78.12
race-high_accuracy: 93.75
gemma-7b-it-vllm:
gsm8k_accuracy: 46.88
gsm8k_accuracy: 31.25
race-high_accuracy: 68.75
internlm2_5-7b-chat-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm3-8b-instruct-hf:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.5
internlm2_5-7b-chat-turbomind:
gsm8k_accuracy: 87.50
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
internlm2-chat-1.8b-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 84.38
internlm2-chat-1.8b-sft-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 31.25
race-high_accuracy: 84.38
internlm2-chat-7b-lmdeploy:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 59.38
race-high_accuracy: 84.38
internlm2-chat-7b-sft-turbomind:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 56.25
race-high_accuracy: 90.62
internlm3-8b-instruct-turbomind:
gsm8k_accuracy: 68.75
race-high_accuracy: 87.5
internlm2-chat-7b-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 84.38
gsm8k_accuracy: 59.38
race-high_accuracy: 87.50
llama-3_1-8b-instruct-hf:
gsm8k_accuracy: 84.38
race-high_accuracy: 90.62
llama-3_2-3b-instruct-hf:
gsm8k_accuracy: 68.75
gsm8k_accuracy: 71.88
race-high_accuracy: 81.25
llama-3-8b-instruct-hf:
gsm8k_accuracy: 68.75
@ -72,14 +81,14 @@ chat:
gsm8k_accuracy: 18.75
race-high_accuracy: 46.88
llama-3_1-8b-instruct-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
llama-3_2-3b-instruct-turbomind:
gsm8k_accuracy: 65.62
gsm8k_accuracy: 68.75
race-high_accuracy: 81.25
llama-3-8b-instruct-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 87.5
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
mistral-7b-instruct-v0.2-hf:
gsm8k_accuracy: 40.62
race-high_accuracy: 75
@ -94,13 +103,10 @@ chat:
race-high_accuracy: 78.12
mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
race-high_accuracy: 65.62
mistral-7b-instruct-v0.2-vllm:
gsm8k_accuracy: 31.25
race-high_accuracy: 75
phi-3-mini-4k-instruct-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
gsm8k_accuracy: 21.88
race-high_accuracy: 78.12
qwen2.5-0.5b-instruct-hf:
gsm8k_accuracy: 34.38
race-high_accuracy: 46.88
@ -108,10 +114,10 @@ chat:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
qwen2.5-0.5b-instruct-turbomind:
gsm8k_accuracy: 28.12
race-high_accuracy: 50
gsm8k_accuracy: 31.25
race-high_accuracy: 43.75
qwen2.5-3b-instruct-turbomind:
gsm8k_accuracy: 59.38
gsm8k_accuracy: 56.25
race-high_accuracy: 90.62
qwen1.5-0.5b-chat-hf:
gsm8k_accuracy: 0
@ -123,11 +129,11 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 90.62
qwen2-1.5b-instruct-turbomind:
gsm8k_accuracy: 53.12
gsm8k_accuracy: 56.25
race-high_accuracy: 84.38
qwen2-7b-instruct-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
race-high_accuracy: 87.50
qwen1.5-0.5b-chat-vllm:
gsm8k_accuracy: 3.12
race-high_accuracy: 53.12
@ -143,11 +149,11 @@ chat:
yi-1.5-9b-chat-turbomind:
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
deepseek-v2-lite-chat-hf:
gsm8k_accuracy: 46.88
deepseek-v2_lite-chat-turbomind:
gsm8k_accuracy: 37.5
race-high_accuracy: 71.88
gemma2-27b-it-hf:
gsm8k_accuracy: 75
gsm8k_accuracy: 71.88
race-high_accuracy: 93.75
internlm2_5-20b-chat-hf:
gsm8k_accuracy: 84.38
@ -161,6 +167,9 @@ chat:
mistral-small-instruct-2409-turbomind:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
phi-4:
gsm8k_accuracy: 81.25
race-high_accuracy: 87.50
qwen2.5-14b-instruct-hf:
gsm8k_accuracy: 71.88
race-high_accuracy: 96.88
@ -168,40 +177,41 @@ chat:
gsm8k_accuracy: 68.75
race-high_accuracy: 93.75
yi-1.5-34b-chat-turbomind:
gsm8k_accuracy: 78.12
gsm8k_accuracy: 75.00
race-high_accuracy: 93.75
deepseek-67b-chat-hf:
gsm8k_accuracy: 71.88
deepseek-67b-chat-turbomind:
gsm8k_accuracy: 75.00
race-high_accuracy: 78.12
deepseek-r1-distill-qwen-32b-turbomind:
gsm8k_accuracy: 25
race-high_accuracy: 90.62
llama-3_3-70b-instruct-turbomind:
gsm8k_accuracy: 93.75
race-high_accuracy: 87.5
mixtral-8x7b-instruct-v0.1-hf:
gsm8k_accuracy: 59.38
race-high_accuracy: 81.25
mixtral-large-instruct-2411-turbomind:
gsm8k_accuracy: 90.62
gsm8k_accuracy: 87.50
race-high_accuracy: 93.75
nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
gsm8k_accuracy: 87.5
race-high_accuracy: 46.88
gsm8k_accuracy: 93.75
race-high_accuracy: 50.00
qwen2.5-72b-instruct-turbomind:
gsm8k_accuracy: 75
race-high_accuracy: 93.75
gsm8k_accuracy: 81.25
race-high_accuracy: 90.62
deepseek-r1-distill-llama-70b-turbomind:
gsm8k_accuracy: 40.62
race-high_accuracy: 90.62
deepseek-v2_5-1210-turbomind:
gsm8k_accuracy: 90.62
race-high_accuracy: 84.38
mixtral-8x22b-instruct-v0.1-hf:
gsm8k_accuracy: 81.25
race-high_accuracy: 81.25
mixtral-8x22b-instruct-v0.1-turbomind:
gsm8k_accuracy: 78.12
race-high_accuracy: 78.12
mixtral-8x22b-instruct-v0.1-vllm:
gsm8k_accuracy: 78.12
race-high_accuracy: 78.12
base:
glm-4-9b-hf:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
glm-4-9b-turbomind:
gsm8k_accuracy: 62.5
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
@ -210,15 +220,10 @@ base:
GPQA_diamond_accuracy: 0
race-high_accuracy: 46.88
winogrande_accuracy: 71.88
deepseek-moe-16b-base-hf:
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 0
race-high_accuracy: 21.88
winogrande_accuracy: 65.62
deepseek-7b-base-turbomind:
gsm8k_accuracy: 21.88
gsm8k_accuracy: 18.75
GPQA_diamond_accuracy: 0
race-high_accuracy: 46.88
race-high_accuracy: 43.75
winogrande_accuracy: 84.38
deepseek-moe-16b-base-vllm:
gsm8k_accuracy: 21.88
@ -245,16 +250,21 @@ base:
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 65.62
winogrande_accuracy: 71.88
gemma-2-9b-turbomind:
gsm8k_accuracy: 68.75
GPQA_diamond_accuracy: 0
race-high_accuracy: 18.75
winogrande_accuracy: 46.88
gemma-2b-vllm:
gsm8k_accuracy: 15.62
GPQA_diamond_accuracy: 3.12
race-high_accuracy:
winogrande_accuracy:
race-high_accuracy: 28.12
winogrande_accuracy: 68.75
gemma-7b-vllm:
gsm8k_accuracy: 53.12
GPQA_diamond_accuracy: 9.38
race-high_accuracy:
winogrande_accuracy:
gsm8k_accuracy: 43.75
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 81.25
winogrande_accuracy: 81.25
internlm2_5-7b-hf:
gsm8k_accuracy: 37.5
GPQA_diamond_accuracy: 25
@ -265,31 +275,26 @@ base:
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 62.5
winogrande_accuracy: 78.12
internlm2-base-7b-hf:
gsm8k_accuracy: 3.12
GPQA_diamond_accuracy: 21.88
race-high_accuracy: 75
winogrande_accuracy: 65.62
internlm2-1.8b-turbomind:
gsm8k_accuracy: 12.5
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 6.25
GPQA_diamond_accuracy: 12.5
race-high_accuracy: 71.88
winogrande_accuracy: 78.12
internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 93.75
winogrande_accuracy: 87.50
internlm2-7b-turbomind:
gsm8k_accuracy: 53.12
GPQA_diamond_accuracy: 21.88
race-high_accuracy: 71.88
winogrande_accuracy: 84.38
internlm2-base-7b-turbomind:
gsm8k_accuracy: 37.50
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 81.25
winogrande_accuracy: 75
internlm2_5-7b-turbomind:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
internlm2-7b-turbomind:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 78.12
winogrande_accuracy: 71.88
internlm2-base-7b-turbomind:
gsm8k_accuracy: 28.12
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 71.88
winogrande_accuracy: 62.50
llama-2-7b-hf:
gsm8k_accuracy: 21.88
GPQA_diamond_accuracy: 21.88
@ -306,15 +311,15 @@ base:
race-high_accuracy: 65.62
winogrande_accuracy: 65.62
llama-3.1-8b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 78.12
winogrande_accuracy: 78.12
llama-3-8b-turbomind:
gsm8k_accuracy: 50
gsm8k_accuracy: 46.88
GPQA_diamond_accuracy: 12.50
race-high_accuracy: 65.62
winogrande_accuracy: 78.12
winogrande_accuracy: 81.25
mistral-7b-v0.3-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 6.25
@ -326,15 +331,15 @@ base:
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
qwen2.5-1.5b-turbomind:
gsm8k_accuracy: 62.50
GPQA_diamond_accuracy: 12.50
race-high_accuracy: 78.12
winogrande_accuracy: 68.75
qwen2.5-7b-turbomind:
gsm8k_accuracy: 75.00
GPQA_diamond_accuracy: 25
race-high_accuracy: 87.5
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 75
winogrande_accuracy: 71.88
qwen2.5-7b-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 87.5
winogrande_accuracy: 75.00
qwen1.5-moe-a2.7b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 18.75
@ -356,20 +361,20 @@ base:
race-high_accuracy: 87.5
winogrande_accuracy: 68.75
qwen2-1.5b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 9.38
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 6.25
race-high_accuracy: 81.25
winogrande_accuracy: 75
qwen2-7b-turbomind:
gsm8k_accuracy: 75.00
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 12.5
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
winogrande_accuracy: 75
qwen1.5-0.5b-vllm:
gsm8k_accuracy: 9.38
GPQA_diamond_accuracy: 0
race-high_accuracy: 56.25
winogrande_accuracy: 62.5
winogrande_accuracy: 59.38
yi-1.5-6b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
@ -384,25 +389,10 @@ base:
gsm8k_accuracy: 78.12
GPQA_diamond_accuracy: 40.62
race-high_accuracy: 87.5
winogrande_accuracy: 71.88
deepseek-v2-lite-hf:
gsm8k_accuracy: 31.25
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 59.38
winogrande_accuracy: 71.88
internlm2-20b-hf:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 68.75
winogrande_accuracy: 75
internlm2-base-20b-hf:
gsm8k_accuracy: 12.5
GPQA_diamond_accuracy: 9.38
race-high_accuracy: 84.38
winogrande_accuracy: 65.62
internlm2-20b-turbomind:
gsm8k_accuracy: 71.88
GPQA_diamond_accuracy: 15.62
GPQA_diamond_accuracy: 18.75
race-high_accuracy: 68.75
winogrande_accuracy: 81.25
qwen2.5-14b-hf:
@ -420,33 +410,23 @@ base:
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 93.75
winogrande_accuracy: 81.25
deepseek-67b-base-hf:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 81.25
winogrande_accuracy: 90.62
deepseek-67b-base-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 28.12
race-high_accuracy: 81.25
winogrande_accuracy: 84.38
llama-3-70b-turbomind:
gsm8k_accuracy: 59.38
GPQA_diamond_accuracy: 9.38
GPQA_diamond_accuracy: 34.38
race-high_accuracy: 78.12
winogrande_accuracy: 81.25
llama-3-70b-turbomind:
gsm8k_accuracy: 56.25
GPQA_diamond_accuracy: 15.62
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
qwen2.5-72b-turbomind:
gsm8k_accuracy: 84.38
GPQA_diamond_accuracy: 34.38
GPQA_diamond_accuracy: 31.25
race-high_accuracy: 93.75
winogrande_accuracy: 87.5
deepseek-v2-turbomind:
gsm8k_accuracy: 65.62
GPQA_diamond_accuracy: 15.62
GPQA_diamond_accuracy: 9.38
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
llama-3-70b-hf:
gsm8k_accuracy: 62.5
GPQA_diamond_accuracy: 3.12
race-high_accuracy: 93.75
winogrande_accuracy: 84.38
winogrande_accuracy: 81.25

View File

@ -44,7 +44,7 @@ on:
type: string
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
schedule:
- cron: '15 14 * * 0,2'
- cron: '15 14 * * 0,3'
env:
HF_DATASETS_OFFLINE: 1
@ -61,6 +61,7 @@ env:
HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
CONDA_ENV: regression_test
export VLLM_WORKER_MULTIPROC_METHOD: spawn
jobs:
build-pypi:
@ -92,7 +93,6 @@ jobs:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
environment: 'prod'
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
@ -126,7 +126,6 @@ jobs:
if: ${{!cancelled()}}
needs: ['build-pypi', 'build-pypi-lmdeploy']
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 120 #2hours
steps:
- name: Clone repository
@ -190,7 +189,6 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
runs-on: volc_cu12_daily
environment: 'prod'
timeout-minutes: 180 #3hours
steps:
- name: Clone repository
@ -231,7 +229,6 @@ jobs:
matrix:
regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
runs-on: volc_cu12_local
environment: 'prod'
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
@ -258,27 +255,33 @@ jobs:
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run model test - api
if: matrix.regression_func == 'api'
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 180s
env | grep PROXY
env | grep proxy
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@ -307,7 +310,6 @@ jobs:
matrix:
function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
runs-on: volc_cu12
environment: 'prod'
timeout-minutes: 480 #8hours
steps:
- name: Clone repository
@ -341,7 +343,6 @@ jobs:
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
timeout-minutes: 5
runs-on: self-hosted
environment: 'prod'
steps:
- name: notify
run: |

View File

@ -120,4 +120,4 @@ repos:
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
# args: ["mmocr", "tests", "tools"] # these directories will be checked

View File

@ -120,4 +120,4 @@ repos:
# hooks:
# - id: check-algo-readme
# - id: check-copyright
# args: ["mmocr", "tests", "tools"] # these directories will be checked
# args: ["mmocr", "tests", "tools"] # these directories will be checked

View File

@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥

View File

@ -57,8 +57,9 @@
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, which lets multiple evaluators work in sequence so that custom evaluation pipelines can be built for more complex scenarios. See the [documentation](docs/zh_cn/advanced_guides/llm_judge.md) for usage details! 🔥🔥🔥
- **\[2025.03.11\]** We now support `SuperGPQA`, a knowledge benchmark covering 285 graduate-level disciplines. Give it a try! 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/zh_cn/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluation and `MATHEvaluator` for mathematical reasoning assessment. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) docs for more details! 🔥🔥🔥
- **\[2025.01.16\]** We now support [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct), which achieves the best performance among models of comparable size on reasoning and knowledge tasks. Give it a try.
- **\[2024.12.17\]** We provide the December CompassAcademic leaderboard evaluation script [CompassAcademic](configs/eval_academic_leaderboard_202412.py); the official results can be reproduced with a simple configuration.

View File

@ -116,6 +116,12 @@
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
configpath_llmjudge: ''
- MedXpertQA:
name: MedXpertQA
category: Knowledge / Medicine
paper: https://arxiv.org/abs/2501.18362
configpath: opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py
configpath_llmjudge: opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py
- musr:
name: MuSR
category: Reasoning
@ -615,8 +621,8 @@
name: MATH
category: Math
paper: https://arxiv.org/pdf/2103.03874
configpath: opencompass/configs/datasets/math
configpath_llmjudge: ''
configpath: opencompass/configs/datasets/math/math_gen.py
configpath_llmjudge: opencompass/configs/datasets/math/math_llm_judge_gen.py
- math500:
name: MATH500
category: Math

View File

@ -49,7 +49,7 @@ export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
Note that by default, OpenCompass will use these three environment variables, but if you use configuration files to configure the evaluation service, these environment variables will not take effect.
### ### Using LLM for Evaluation via Configuration Files
### Using LLM for Evaluation via Configuration Files
To set up an LLM judge evaluation, you'll need to configure three main components:
@ -264,6 +264,107 @@ Example evaluation output:
}
```
## CascadeEvaluator
OpenCompass also provides a CascadeEvaluator that combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes:
1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, then only sends samples that were deemed incorrect by the rule-based evaluation to an LLM judge for re-evaluation. This approach reduces reliance on LLM judgments while maintaining accuracy, thus lowering evaluation costs and time.
2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and LLM judge, then considers a sample correct if either method marks it as correct. This approach can increase the leniency of evaluation but may result in higher costs since all samples require LLM evaluation.
### Configuring CascadeEvaluator
Here's an example of how to configure the CascadeEvaluator:
```python
# Define a rule-based evaluator
rule_evaluator = dict(type=MATHEvaluator)
# Define an LLM judge evaluator
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=YourDataset,
path='path/to/your/dataset',
reader_cfg=reader_cfg,
),
judge_cfg=dict(), # Can use environment variables to configure the judge model
)
# Configure cascade evaluator (cascade mode)
cascade_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False # Cascade mode
)
# For parallel mode, set parallel=True
parallel_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=True # Parallel mode
)
# Use the cascade evaluator in your dataset evaluation config
eval_cfg = dict(evaluator=cascade_evaluator)
```
### Evaluation Results
The cascade evaluator outputs detailed evaluation statistics including:
- Accuracy of the rule-based evaluation
- Accuracy of the LLM evaluation (for samples that failed rule-based evaluation in cascade mode)
- Final combined accuracy
Example output:
```python
{
'accuracy': 85.0, # Final accuracy
'cascade_stats': {
'total_samples': 100,
'rule_correct': 70, # Number of samples correct by rule evaluation
'rule_accuracy': 70.0, # Accuracy of rule evaluation
'llm_evaluated': 30, # Number of samples evaluated by LLM (failed samples in cascade mode)
'llm_correct': 15, # Number of samples correct by LLM evaluation
'llm_accuracy': 50.0, # Accuracy of LLM evaluation
'final_correct': 85, # Total correct samples
'final_accuracy': 85.0, # Final accuracy
'parallel_mode': False, # Whether parallel mode was used
},
'details': [
# Detailed evaluation results for each sample
]
}
```
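In this cascade-mode example, the final count is simply the two stages combined: `final_correct = rule_correct + llm_correct = 70 + 15 = 85`, because only the 30 samples that failed the rule-based check are re-scored by the LLM judge. In parallel mode, every sample is sent to the LLM judge and a sample counts as correct if either stage accepts it, so the final count is a union rather than a sum.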
The cascade evaluator is particularly useful for:
1. Scenarios that require balancing evaluation cost and accuracy
2. Cases where rule-based evaluators are available but might not be comprehensive
3. Evaluation tasks that need more nuanced judgment for edge cases
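Besides a full `rule_evaluator` config, `CascadeEvaluator` also accepts a plain `sample_score_fn` callable for the rule stage (at least one of the two must be provided). Below is a minimal sketch reusing the `llm_judge_evaluator` defined above; the `exact_match` helper is illustrative only and not part of OpenCompass.
```python
from opencompass.evaluator import CascadeEvaluator


def exact_match(prediction: str, reference: str) -> dict:
    # Return a dict with at least a 'correct' field; a bare bool is also
    # accepted and wrapped into this form by the evaluator.
    is_correct = prediction.strip() == reference.strip()
    return {'correct': is_correct, 'pred': prediction, 'answer': reference}


cascade_evaluator = dict(
    type=CascadeEvaluator,
    llm_evaluator=llm_judge_evaluator,  # LLM judge config shown above
    sample_score_fn=exact_match,        # replaces rule_evaluator for the rule stage
    parallel=False,                     # cascade mode: only rule failures reach the judge
)
```
This is convenient when the rule check is simple enough that a dedicated evaluator class would be overkill.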
## Complete Example
For a complete working example, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving using an LLM judge.
For a complete working example using GenericLLMEvaluator, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving with an LLM judge.
For a complete working example using CascadeEvaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving with cascade evaluation.

View File

@ -117,6 +117,10 @@ html_js_files = [
'js/custom.js'
]
html_context = {
'github_version': 'main',
}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.

View File

@ -35,7 +35,7 @@ HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
recommanded_dataset_list = [
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
'mmlu_pro', 'musr'
'mmlu_pro', 'musr', 'math500'
]

View File

@ -57,7 +57,7 @@ The parameter explanation is as follows:
- `-w`: Specify the working path, default is `./outputs/default`.
- `-l`: Enable status reporting via Lark bot.
- `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
- `--dump-eval-details`: When enabled, the evaluation results under the `results` folder will include more details, such as the correctness of each sample.
- `--dump-eval-details`: Enabled by default; the evaluation results under the `results` folder will include more details, such as the correctness of each sample. Set `--dump-eval-details False` to disable it (see the sketch below).
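For reference, here is a minimal, self-contained sketch of how this flag is parsed; it mirrors the `--dump-eval-details` argument definition added to `run.py` in this change, with everything else omitted:
```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--dump-eval-details',
    nargs='?',
    const=True,    # a bare `--dump-eval-details` keeps dumping enabled
    default=True,  # omitting the flag also keeps dumping enabled
    type=lambda x: False if x and x.lower() == 'false' else True,
)

print(parser.parse_args([]).dump_eval_details)                                 # True
print(parser.parse_args(['--dump-eval-details']).dump_eval_details)            # True
print(parser.parse_args(['--dump-eval-details', 'False']).dump_eval_details)   # False
```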
Using run mode `-m all` as an example, the overall execution flow is as follows:

View File

@ -263,6 +263,106 @@ GenericLLMEvaluator专为使用LLM作为评判器评估模型输出而设计。
}
```
## CascadeEvaluator
OpenCompass also provides a `CascadeEvaluator`, which combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes:
1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, and only the samples judged incorrect by the rule-based step are sent to an LLM judge for re-evaluation. This reduces reliance on LLM judgments while maintaining accuracy, thereby lowering evaluation cost and time.
2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and the LLM judge, and counts a sample as correct if either evaluator marks it correct. This makes the evaluation more lenient but may cost more, since every sample requires LLM evaluation.
### Configuring CascadeEvaluator
Here is an example of how to configure the `CascadeEvaluator`:
```python
# Define a rule-based evaluator
rule_evaluator = dict(type=MATHEvaluator)
# Define an LLM judge evaluator
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=YourDataset,
path='path/to/your/dataset',
reader_cfg=reader_cfg,
),
judge_cfg=dict(), # The judge model can be configured via environment variables
)
# Configure the cascade evaluator (cascade mode)
cascade_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False # Cascade mode
)
# For parallel mode, set parallel=True
parallel_evaluator = dict(
type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=True # Parallel mode
)
# Use the cascade evaluator in the dataset evaluation config
eval_cfg = dict(evaluator=cascade_evaluator)
```
### Evaluation Results
The cascade evaluator outputs detailed evaluation statistics, including:
- Accuracy of the rule-based evaluation
- Accuracy of the LLM evaluation (for samples that failed the rule-based evaluation in cascade mode)
- Final combined accuracy
Example output:
```python
{
'accuracy': 85.0, # Final accuracy
'cascade_stats': {
'total_samples': 100,
'rule_correct': 70, # Number of samples judged correct by the rule-based evaluation
'rule_accuracy': 70.0, # Accuracy of the rule-based evaluation
'llm_evaluated': 30, # Number of samples evaluated by the LLM (in cascade mode, the samples that failed rule-based evaluation)
'llm_correct': 15, # Number of samples judged correct by the LLM evaluation
'llm_accuracy': 50.0, # Accuracy of the LLM evaluation
'final_correct': 85, # Final number of correct samples
'final_accuracy': 85.0, # Final accuracy
'parallel_mode': False, # Whether parallel mode was used
},
'details': [
# Detailed evaluation results for each sample
]
}
```
The cascade evaluator is particularly useful for:
1. Scenarios that require balancing evaluation cost and accuracy
2. Cases where a rule-based evaluator is available but may not be comprehensive
3. Evaluation tasks that need more precise judgment of edge cases
## Complete Example
For a complete working example, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving with an LLM judge.
To learn about the generic LLM judge, refer to the `eval_llm_judge.py` file in the examples directory, which shows how to evaluate math problems with an LLM judge.
To learn about the cascade evaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which shows how to evaluate math problems with the cascade evaluator.

View File

@ -117,6 +117,10 @@ html_js_files = [
'js/custom.js'
]
html_context = {
'github_version': 'main',
}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.

View File

@ -33,7 +33,7 @@ HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
recommanded_dataset_list = [
'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
'mmlu_pro', 'musr'
'mmlu_pro', 'musr', 'math500'
]

View File

@ -57,7 +57,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
- `-w`: Specify the working path; defaults to `./outputs/default`.
- `-l`: Enable status reporting via the Lark bot.
- `--dry-run`: When enabled, inference and evaluation tasks are only dispatched but not actually run, which is convenient for debugging;
- `--dump-eval-details`: When enabled, the evaluation results under `results` will include more detailed information, such as whether each sample is correct.
- `--dump-eval-details`: Enabled by default; the evaluation results under `results` will include more detailed information, such as whether each sample is correct. Set `--dump-eval-details False` to disable it.
Taking run mode `-m all` as an example, the overall execution flow is as follows:

View File

@ -0,0 +1,127 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_evaluator import MATHEvaluator
from opencompass.datasets import (
MATHDataset,
math_postprocess_v2,
normalize_final_answer,
)
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Models
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
reader_cfg = dict(input_columns=['problem'], output_column='solution')
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
########################## Evaluator #################################
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
),
judge_cfg=dict(),
)
rule_evaluator = dict(type=MATHEvaluator)
cascade_evaluator = dict(type=CascadeEvaluator,
llm_evaluator=llm_judge_evaluator,
rule_evaluator=rule_evaluator,
parallel=False
)
########################## #################################
eval_cfg = dict()
# eval_cfg['evaluator'] = rule_evaluator
# eval_cfg['evaluator'] = llm_judge_evaluator
eval_cfg['evaluator'] = cascade_evaluator
math_datasets = [
dict(
abbr='math_prm800k_500',
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
datasets = math_datasets
models = lmdeploy_qwen2_5_7b_instruct_model
work_dir = 'math_prm800k_500_cascade_evaluator'

View File

@ -1,7 +1,7 @@
from mmengine.config import read_base
with read_base():
from .datasets.dingo.dingo_gen import datasets
from .models.hf_internlm.hf_internlm_7b import models
from opencompass.configs.datasets.dingo.dingo_gen import datasets
from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
work_dir = './outputs/eval_dingo'

View File

@ -119,8 +119,11 @@ def parse_args():
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc.',
action='store_true',
'correctness of each sample, bpb, etc. Defaults to True.',
nargs='?',
const=True,
default=True,
type=lambda x: False if x and x.lower() == 'false' else True
)
parser.add_argument(
'--dump-extract-rate',
@ -233,7 +236,6 @@ def parse_custom_dataset_args(custom_dataset_parser):
def main():
args = parse_args()
if args.num_gpus is not None:
raise ValueError('The `--num-gpus` argument is deprecated, please use '
'`--hf-num-gpus` to describe number of gpus used for '
@ -350,6 +352,9 @@ def main():
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
logger.warning('Default to dump eval details, it might take extra '
'space to save all the evaluation details. '
'Set --dump-eval-details False to skip the details dump')
cfg.eval.runner.task.dump_details = True
if args.dump_extract_rate:
cfg.eval.runner.task.cal_extract_rate = True

View File

@ -0,0 +1,57 @@
from opencompass.datasets import MedXpertQADataset, MedXpertQAEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'medical_task',
'body_system',
'question_type',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=MedXpertQAEvaluator),
pred_role='BOT',
)
medxpertqa_dataset = dict(
type=MedXpertQADataset,
abbr='medxpertqa',
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
medxpertqa_datasets = [medxpertqa_dataset]

View File

@ -0,0 +1,104 @@
from opencompass.datasets import MedXpertQADataset, MedXpertQA_llmjudge_postprocess
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this?
ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: Q: {question}\nA: Among {start} through {end}, the answer is\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Reader configuration
reader_cfg = dict(
input_columns=[
'question',
'options',
'medical_task',
'body_system',
'question_type',
'prompt_mode',
],
output_column='label',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
],
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MedXpertQADataset,
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=MedXpertQA_llmjudge_postprocess),
),
)
medxpertqa_dataset = dict(
type=MedXpertQADataset,
abbr='medxpertqa',
path='TsinghuaC3I/MedXpertQA',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
medxpertqa_datasets = [medxpertqa_dataset]

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .math_gen_265cce import math_datasets # noqa: F401, F403
from .math_gen_a58d9d import math_datasets # noqa: F401, F403

View File

@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset
from opencompass.openicl.icl_evaluator import MATHEvaluator
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'),
dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'),
dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'),
dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'),
dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'),
dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'),
dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'),
dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'),
dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator)
)
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='opencompass/math',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]

View File

@ -1,35 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess
QUERY_TEMPLATE = """
Solve the following math problem step by step. The last line of your response should be of the form ANSWER: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
{problem}
Remember to put your answer on its own line after "ANSWER:", and you do not need to use a \\boxed command.
""".strip()
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='opencompass/math',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .math_llm_judge_gen_56606f import math_datasets # noqa: F401, F403

View File

@ -0,0 +1,85 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import MATHDataset
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt="Question: {problem}\nLet's think step by step\nAnswer:")
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=MATHDataset,
path='opencompass/math',
reader_cfg=math_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='opencompass/math',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]

View File

@ -0,0 +1,22 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='mixtral-8x22b-instruct-v0.1-turbomind',
path='mistralai/Mixtral-8x22B-Instruct-v0.1',
engine_config=dict(
session_len=32768,
max_batch_size=16,
tp=8,
cache_max_entry_count=0.7,
),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=8),
)
]

View File

@ -0,0 +1,225 @@
import re
from datasets import Dataset, load_dataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from opencompass.utils import get_logger
from .base import BaseDataset
def _parse(item, prompt_mode):
item['start'] = chr(65)
item['end'] = chr(65 + len(item.get('options', [])) - 1)
item['prompt_mode'] = prompt_mode
return item
@LOAD_DATASET.register_module()
class MedXpertQADataset(BaseDataset):
@staticmethod
def load(path: str, prompt_mode: str, **kwargs):
dataset = load_dataset(path, 'Text', split='test')
# dataset = load_dataset(path, 'Text', split='dev')
if prompt_mode == 'zero-shot':
dataset = dataset.map(lambda item: _parse(item, prompt_mode))
elif prompt_mode == 'few-shot':
pass # TODO: Implement few-shot prompt
return dataset
class MedXpertQAEvaluator(BaseEvaluator):
def score(self, predictions, references, test_set):
method = test_set['prompt_mode'][0]
if len(predictions) != len(references):
return {'error': 'preds and refs have different length'}
correct = 0
count = 0
details = []
for idx, (i, j) in enumerate(zip(predictions, references)):
i = answer_cleansing(method, i, test_set['options'][idx],
test_set['label'][idx])
detail = {'pred': i, 'answer': j, 'correct': False}
count += 1
if i == j:
correct += 1
detail['correct'] = True
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
@TEXT_POSTPROCESSORS.register_module()
def answer_cleansing(
method: str,
prediction: str,
options: list,
label: str,
) -> str:
# Clean up unwanted phrases in the prediction
for unwanted_phrase in [
'I understand',
'A through J',
'A through E',
'A through D',
]:
prediction = prediction.replace(unwanted_phrase, '')
options_num = len(options)
options = [chr(65 + i) for i in range(options_num)]
options_str = r'\b(' + '|'.join(options) + r')\b'
prediction = re.findall(options_str, prediction)
if len(prediction) == 0:
prediction = []
else:
# If there is a "label" and its length is 1,
# process prediction accordingly
if len(label) == 1:
if method == 'few-shot':
answer_flag = True if len(prediction) > 1 else False
# choose the first or last element based on the answer_flag
if answer_flag:
prediction = [prediction[0]]
else:
prediction = [prediction[-1]]
elif method == 'zero-shot':
# choose the first element in list
prediction = [prediction[0]]
else:
raise ValueError('Method is not properly defined ...')
# Remove trailing period if it exists
if prediction[0] and prediction[0].endswith('.'):
prediction[0] = prediction[0][:-1]
return prediction[0]
def _generic_llmjudge_postprocess(judgement: str):
match = re.search(r'(A|B)', judgement)
grade_letter = (match.group(0) if match else 'B'
) # Default to "INCORRECT" if no match
return grade_letter
def MedXpertQA_llmjudge_postprocess(
output: dict,
output_path: str,
dataset: Dataset,
) -> dict:
# Get the original dataset
original_dataset = dataset.reader.dataset['test']
judged_answers = []
original_responses = []
references = []
details = []
# Initialize statistics dictionaries
stats = {'medical_task': {}, 'body_system': {}, 'question_type': {}}
total_correct = 0
total_count = 0
# Process each sample
for k, v in output.items():
idx = int(k) # Convert key to integer for indexing
original_responses.append(v['prediction'])
processed_judge = _generic_llmjudge_postprocess(v['prediction'])
# Get category information from the dataset
sample = original_dataset[idx]
medical_task = sample.get('medical_task', 'unknown')
body_system = sample.get('body_system', 'unknown')
question_type = sample.get('question_type', 'unknown')
# Initialize category stats if not exists
for level, key in [
('medical_task', medical_task),
('body_system', body_system),
('question_type', question_type),
]:
if key not in stats[level]:
stats[level][key] = {'correct': 0, 'total': 0}
# Record the judgment
if processed_judge is not None:
judged_answers.append(processed_judge)
try:
gold = v['gold']
references.append(gold)
except KeyError:
get_logger().warning(
f'No gold answer for {k}, use empty string as reference!')
gold = ''
references.append('')
# Check if the answer is correct (A means correct)
is_correct = processed_judge == 'A'
total_count += 1
if is_correct:
total_correct += 1
# Update category stats
for level, key in [
('medical_task', medical_task),
('body_system', body_system),
('question_type', question_type),
]:
stats[level][key]['correct'] += 1
# Update category totals
for level, key in [
('medical_task', medical_task),
('body_system', body_system),
('question_type', question_type),
]:
stats[level][key]['total'] += 1
# Add to details
details.append({
'id': k,
'question': sample['question'],
'options': sample['options'],
'origin_prompt': v['origin_prompt'],
'llm_judge': processed_judge,
'gold': gold,
'is_correct': is_correct,
'medical_task': medical_task,
'body_system': body_system,
'question_type': question_type,
})
# Calculate overall accuracy with two decimal places
overall_accuracy = (round(
(total_correct / total_count * 100), 2) if total_count > 0 else 0.00)
# Initialize results dictionary
results = {
'accuracy': overall_accuracy,
'total_correct': total_correct,
'total_count': total_count,
'details': details,
}
# Calculate accuracy for each category and flatten into results
for level in stats:
for key, value in stats[level].items():
if value['total'] > 0:
# Calculate accuracy with two decimal places
accuracy = round((value['correct'] / value['total'] * 100), 2)
# Create a flattened key for the category
flat_key = f'MedXpertQA-{key}'
# Add to results
results[flat_key] = accuracy
return results

View File

@ -93,6 +93,7 @@ from .math_intern import * # noqa: F401, F403
from .mathbench import * # noqa: F401, F403
from .mbpp import * # noqa: F401, F403
from .medbench import * # noqa: F401, F403
from .MedXpertQA import * # noqa: F401, F403
from .mgsm import * # noqa: F401, F403
from .mmlu import * # noqa: F401, F403
from .mmlu_cf import * # noqa: F401, F403

View File

@ -68,7 +68,7 @@ class DingoEvaluator(BaseEvaluator):
json.dump(d, f, ensure_ascii=False)
f.write('\n')
input_data = {
'eval_model': 'llm_base',
'eval_group': 'llm_base',
'input_path': file_name,
'output_path': './outputs/dingo/',
'save_data': True,

View File

@ -7,6 +7,7 @@ from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .arena_hard import arenahard_bradleyterry_postprocess # noqa: F401, F403
from .arena_hard import arenahard_postprocess # noqa: F401, F403
from .commonbench import commonbench_postprocess
from .compass_arena import CompassArenaDataset # noqa: F401, F403
from .compass_arena import \
compassarena_bradleyterry_postprocess # noqa: F401, F403

View File

@ -0,0 +1,56 @@
# flake8: noqa: E501
import re
from collections import defaultdict
from typing import Optional
from opencompass.registry import DICT_POSTPROCESSORS
from .utils import get_judgeanswer_and_reference
def post_process(judgement: str):
"""Input a string like below:
xxx[[5]]xxx, and extract the score
"""
judgement = judgement['prediction']
pattern = r'\[\[([\d.]+)\]\]'
matched_result = re.findall(pattern, judgement)
if matched_result:
score = float(matched_result[0])
else:
return None
return {'score': score}
def get_capability_results(judged_answers, references):
capability_ratings = defaultdict(int)
capability_counts = defaultdict(int)
for ans, ref in zip(judged_answers, references):
capability_ratings['total'] += ans['score']
capability_counts['total'] += 1
capability_ratings[ref['capability']] += ans['score']
capability_counts[ref['capability']] += 1
capability_avg_ratings = defaultdict(float)
for capability, total_score in capability_ratings.items():
s = total_score / capability_counts[capability]
s = round(s, 2)
capability_avg_ratings[capability] = s
return capability_avg_ratings
@DICT_POSTPROCESSORS.register_module('commenbench')
def commonbench_postprocess(
output: dict,
output_path: str,
post_process: Optional[callable] = post_process,
) -> dict:
judged_answers, references = get_judgeanswer_and_reference(
output, output_path, post_process)
results = get_capability_results(judged_answers, references)
results['details'] = output
return results

View File

@ -1 +1,2 @@
from .cascade_evaluator import CascadeEvaluator # noqa
from .generic_llm_evaluator import GenericLLMEvaluator # noqa

View File

@ -0,0 +1,302 @@
import os
from typing import Any, Callable, Dict, List, Optional
import mmengine
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
from opencompass.utils.logging import get_logger
@ICL_EVALUATORS.register_module()
class CascadeEvaluator(BaseEvaluator):
"""Cascade Evaluator.
First uses a rule-based method to judge predictions.
If a sample is marked as incorrect by the rule-based method,
then it uses an LLM judge to re-evaluate it.
Arguments:
llm_evaluator (dict): Configuration for the LLM evaluator.
rule_evaluator (Optional[dict]): Configuration for the
rule-based evaluator.
sample_score_fn (Optional[Callable]): A function to
score individual samples. If provided without rule_evaluator,
this function will be used directly.
parallel (bool): Whether to run in parallel mode.
"""
def __init__(
self,
llm_evaluator: Dict,
rule_evaluator: Optional[Dict] = None,
sample_score_fn: Optional[Callable] = None,
parallel: bool = True,
) -> None:
self.logger = get_logger()
# Initialize the LLM evaluator
llm_evaluator_type = llm_evaluator.pop('type')
if isinstance(llm_evaluator_type, str):
llm_evaluator_type = ICL_EVALUATORS.get(llm_evaluator_type)
self.llm_evaluator = llm_evaluator_type(**llm_evaluator)
# Initialize the rule evaluator if provided
self.rule_evaluator = None
if rule_evaluator:
rule_evaluator_type = rule_evaluator.pop('type')
if isinstance(rule_evaluator_type, str):
rule_evaluator_type = ICL_EVALUATORS.get(rule_evaluator_type)
self.rule_evaluator = rule_evaluator_type(**rule_evaluator)
self.sample_score_fn = sample_score_fn
self.parallel = parallel
# At least one of rule_evaluator or sample_score_fn must be provided
if not self.rule_evaluator and not self.sample_score_fn:
raise ValueError(
'Either rule_evaluator or sample_score_fn must be provided')
def sample_score(self, prediction: str, reference: str) -> Dict[str, Any]:
"""Score a single sample using sample_score_fn or rule_evaluator.
Args:
prediction: The model's prediction.
reference: The ground truth.
Returns:
Dict: A dictionary containing the score and other details.
"""
if self.sample_score_fn:
# Use user-provided function to evaluate a single sample
result = self.sample_score_fn(prediction, reference)
if not isinstance(result, dict):
# Ensure result is a dictionary with at least 'correct' field
result = {
'correct': bool(result),
'pred': prediction,
'answer': reference,
}
return result
else:
# Use rule_evaluator to evaluate a single sample by calling
# the score method with single-element lists
result = self.rule_evaluator.score([prediction], [reference])
if 'details' in result and len(result['details']) > 0:
return result['details'][0]
else:
# Fallback if rule_evaluator doesn't provide detailed results
return {
'correct': result.get('accuracy', 0) > 0,
'pred': prediction,
'answer': reference,
}
def _get_llm_correctness(self, llm_detail):
"""Determine if the LLM judge considers the answer correct.
Args:
llm_detail: The evaluation details from the LLM judge.
Returns:
bool: Whether the answer is correct according to the LLM judge.
"""
if 'prediction' in llm_detail:
response = llm_detail['prediction'].strip().upper()
return response == 'A' or response.startswith('CORRECT')
elif 'correct' in llm_detail:
return llm_detail['correct']
elif 'score' in llm_detail:
return llm_detail['score'] > 0.5
return False
def score(
self,
predictions: List[str],
references: List[str],
test_set: Optional[Dataset] = None,
) -> Dict[str, Any]:
"""Score predictions using cascade or parallel evaluation.
Args:
predictions: List of model predictions.
references: List of ground truths.
test_set: Huggingface Dataset containing original test samples.
Returns:
Dict: A dictionary containing the scores and details.
"""
self.logger.info(
f"Running {'parallel' if self.parallel else 'cascade'} evaluation")
# Step 1: Evaluate each sample individually using rule-based evaluation
details = []
failed_predictions = []
failed_references = []
failed_indices = []
for i, (pred, ref) in enumerate(zip(predictions, references)):
result = self.sample_score(pred, ref)
result['evaluation_method'] = 'rule'
details.append({'rule_evaluation': result})
# If the sample failed rule-based evaluation or in parallel
# mode, mark it for LLM evaluation
if not result.get('correct', False) or self.parallel:
failed_predictions.append(pred)
failed_references.append(ref)
failed_indices.append(i)
# Calculate initial accuracy based on rule evaluation
initial_correct = sum(
1 for detail in details
if detail['rule_evaluation'].get('correct', False))
initial_accuracy = (100 * initial_correct /
len(predictions) if predictions else 0)
self.logger.info(
f'Rule-based evaluation: {initial_correct}/{len(predictions)} '
f'correct ({initial_accuracy:.2f}%)')
eval_mode = ('parallel (all samples)'
if self.parallel else 'cascade (only failed samples)')
self.logger.info(f'Samples requiring LLM evaluation ({eval_mode}): '
f'{len(failed_indices)}')
# Step 2: If there are samples for LLM evaluation
if failed_predictions and test_set is not None:
self.logger.info(f'Running LLM evaluation in {eval_mode} mode...')
# Create a subset of the test_set for LLM evaluation
failed_subset = test_set.select(failed_indices)
# Add prediction and reference columns to the dataset
failed_subset = failed_subset.add_column('prediction',
failed_predictions)
failed_subset = failed_subset.add_column('reference',
failed_references)
# Set a custom output path for LLM evaluation
original_out_dir = getattr(self.llm_evaluator, '_out_dir', None)
self.llm_evaluator._out_dir = f'{self._out_dir}_llm_judge'
# Check if results already exist to avoid re-evaluation
llm_results_path = f'{self.llm_evaluator._out_dir}.json'
if os.path.exists(llm_results_path):
self.logger.info(
f'Loading existing LLM evaluation results from '
f'{llm_results_path}')
llm_results = mmengine.load(llm_results_path)
# Extract details from loaded results
if llm_results.get('details', []):
loaded_details = llm_results['details']
else:
loaded_details = llm_results
# Strictly verify that the loaded results match
# the current evaluation needs
if len(loaded_details) != len(failed_indices):
error_msg = (
f'Error: Loaded LLM results contain '
f'{len(loaded_details)} samples, but current '
f'evaluation requires {len(failed_indices)} samples. '
f"The cached results at {llm_results_path} don't match"
f'the current evaluation needs. '
f'Please remove the cache file or fix the mismatch.')
self.logger.error(error_msg)
raise ValueError(error_msg)
else:
# Use GenericLLMEvaluator to evaluate samples
# unset dataset_cfg for GenericLLMEvaluator to
# directly use test_set
self.llm_evaluator.dataset_cfg = None
llm_results = self.llm_evaluator.score(
predictions=failed_predictions,
references=failed_references,
test_set=failed_subset,
)
# Restore original output directory
if original_out_dir:
self.llm_evaluator._out_dir = original_out_dir
if llm_results.get('details', []):
llm_details = llm_results['details']
else:
llm_details = llm_results
# Initialize counters for accuracy calculation
final_correct = initial_correct if not self.parallel else 0
llm_correct = 0
llm_evaluated = 0
# Update the details for samples that were evaluated by LLM
for i, llm_detail in enumerate(llm_details.values()):
original_index = failed_indices[i]
# Store original rule-based evaluation result
rule_result = details[original_index].copy()
rule_correct = rule_result['rule_evaluation'].get(
'correct', False)
# Add LLM evaluation details
details[original_index]['llm_evaluation'] = llm_detail
# Determine LLM correctness judgment and store it
is_correct = self._get_llm_correctness(llm_detail)
details[original_index]['llm_evaluation'][
'llm_correct'] = is_correct
# Count LLM evaluation statistics
llm_evaluated += 1
if is_correct:
llm_correct += 1
# Update final_correct counter based on evaluation mode
if self.parallel:
# In parallel mode, either rule-based or LLM evaluations
# should be correct
if rule_correct or is_correct:
final_correct += 1
else:
# In cascade mode, if rule was incorrect but LLM
# correct, increment
# (rule correct samples are already counted
# in initial_correct)
if not rule_correct and is_correct:
final_correct += 1
# Calculate final accuracy
final_accuracy = (100 * final_correct /
len(predictions) if predictions else 0)
llm_accuracy = (100 * llm_correct /
llm_evaluated if llm_evaluated else 0)
self.logger.info(
f'Final evaluation: {final_correct}/{len(predictions)} '
f'correct ({final_accuracy:.2f}%)')
if llm_evaluated > 0:
self.logger.info(
f'LLM evaluation: {llm_correct}/{llm_evaluated} '
f'correct ({llm_accuracy:.2f}%)')
result = {
'accuracy': final_accuracy,
'cascade_stats': {
'total_samples': len(predictions),
'rule_correct': initial_correct,
'rule_accuracy': initial_accuracy,
'llm_evaluated': llm_evaluated,
'llm_correct': llm_correct,
'llm_accuracy': llm_accuracy,
'final_correct': final_correct,
'final_accuracy': final_accuracy,
'parallel_mode': self.parallel,
},
'details': details,
}
return result

View File

@ -3,6 +3,7 @@ import os.path as osp
from typing import Dict, List, Optional
import mmengine
from datasets import Dataset
from mmengine.config import ConfigDict
from opencompass.openicl.icl_evaluator import BaseEvaluator
@ -82,10 +83,19 @@ class GenericLLMEvaluator(BaseEvaluator):
self,
predictions,
references: Optional[List] = None,
test_set: Optional[Dataset] = None,
) -> Dict:
"""Apply to single-model scoring."""
"""Apply to single-model scoring.
Args:
predictions: List of model predictions
references: List of reference answers
test_set: Optional Dataset containing additional
context for evaluation
"""
assert len(predictions) == len(
references), 'predictions and references must have the same length'
# -------------- Build Inferencer ----------------
self.build_inferencer()
@ -93,9 +103,7 @@ class GenericLLMEvaluator(BaseEvaluator):
predictions = self.pred_postprocess(predictions)
# For Single Round Dialogue
prediction_dict = {}
prediction_dict['prediction'] = predictions
prediction_dict['obj_gold'] = references
prediction_dict = {'prediction': predictions, 'obj_gold': references}
# ---------------- Build Dataset for LLM Judge -----------------
if self.dataset_cfg:
@ -109,19 +117,42 @@ class GenericLLMEvaluator(BaseEvaluator):
dataset.reader.dataset['test'] = dataset.test.add_column(
'reference', references)
else:
# build a default dataset just for comparison
# Handle test_set in the else branch
from opencompass.datasets.lmeval import LMEvalDataset
input_columns = list(prediction_dict.keys())
if references:
input_columns.append('reference')
if test_set is not None:
# If test_set is provided, use it as the base
# Ensure necessary columns exist
if 'prediction' not in test_set.column_names:
test_set = test_set.add_column('prediction', predictions)
if 'reference' not in test_set.column_names:
test_set = test_set.add_column('reference', references)
# Prepare input_columns and data dictionary
input_columns = test_set.column_names
data_dict = {
column: test_set[column]
for column in test_set.column_names
}
else:
# Original default dataset building logic
input_columns = list(prediction_dict.keys())
if references:
input_columns.append('reference')
data_dict = prediction_dict.copy()
if references:
data_dict['reference'] = references
# Create LMEvalDataset
dataset = LMEvalDataset(
reader_cfg=dict(input_columns=input_columns,
output_column=None,
train_split='test'),
reference=references,
**prediction_dict,
reader_cfg=dict(
input_columns=input_columns,
output_column=None,
train_split='test',
),
**data_dict,
)
dataset.reader.output_column = 'reference'
retriever = ZeroRetriever(dataset)
# ----------------- LLM Judge ----------------

View File

@ -91,7 +91,8 @@ class BaseEvaluator:
):
# Check if predictions and references have the
# same length if both are provided
if 'predictions' in score_kwargs and 'references' in score_kwargs:
if ('predictions' in score_kwargs and 'references' in score_kwargs
and score_kwargs['references'] is not None):
if len(score_kwargs['predictions']) != len(
score_kwargs['references']):
raise ValueError(

View File

@ -22,26 +22,16 @@ class MATHEvaluator(BaseEvaluator):
details = []
for i, j in zip(predictions, references):
count += 1
j_with_env = f'${j}$'
gold_parsed = parse(
j,
j_with_env,
extraction_mode='first_match',
extraction_config=[
LatexExtractionConfig(),
ExprExtractionConfig(),
],
)
# If parsing result is empty, try adding LaTeX
# environment and parse again
if len(gold_parsed) == 0:
j_with_env = f'${j}$'
gold_parsed = parse(
j_with_env,
extraction_mode='first_match',
extraction_config=[
LatexExtractionConfig(),
ExprExtractionConfig(),
],
)
if len(gold_parsed) != 0:
# We require the answer to be provided in correct
# latex (no malformed operators)

View File

@ -147,6 +147,5 @@ class CommonSummarizer(CompassArenaSummarizer):
f.write(','.join(new_header) + '\n')
for line in new_table:
f.write(','.join(map(str, line)) + '\n')
print(t)
print(output_file)
return {'qa_bench_' + show_dataset_abbr:json_result}

View File

@ -7,7 +7,6 @@ import random
import statistics
import sys
import time
from collections import Counter
from inspect import signature
from typing import List
@ -19,7 +18,7 @@ from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS,
TEXT_POSTPROCESSORS)
from opencompass.tasks.base import BaseTask, extract_role_pred
from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path,
get_logger, task_abbr_from_cfg)
get_logger)
@TASKS.register_module()
@ -86,6 +85,26 @@ class OpenICLEvalTask(BaseTask):
self._score()
def _score(self):
# Load and preprocess test data
test_set = self._load_and_preprocess_test_data()
# Load predictions
pred_dicts, pred_strs = self._load_predictions()
# Process predictions
pred_strs = self._process_predictions(pred_strs)
# Evaluate predictions
result = self._evaluate_predictions(
pred_strs,
test_set,
pred_dicts,
)
# Save results
self._save_results(result)
def _load_and_preprocess_test_data(self):
"""Load test dataset and apply postprocessing if needed."""
test_set = build_dataset_from_cfg(self.dataset_cfg).test
# Postprocess dataset if necessary
if 'dataset_postprocessor' in self.eval_cfg:
@ -100,7 +119,10 @@ class OpenICLEvalTask(BaseTask):
test_set = test_set.map(postprocess)
# Load predictions
return test_set
def _load_predictions(self):
"""Load model predictions from files."""
filename = get_infer_output_path(
self.model_cfg,
self.dataset_cfg,
@ -110,217 +132,188 @@ class OpenICLEvalTask(BaseTask):
root, ext = osp.splitext(filename)
partial_filename = root + '_0' + ext
# Get sc_size if use Self-Consistency
sc_size = self.eval_cfg.get('sc_size')
if not osp.exists(osp.realpath(filename)) and not osp.exists(
osp.realpath(partial_filename)):
result = {'error': 'No predictions found.'}
raise FileNotFoundError(
f'Prediction files not found: neither {filename} '
f'nor {partial_filename} exists')
if osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
preds = [preds[str(i)] for i in range(len(preds))]
else:
if osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
preds = [preds[str(i)] for i in range(len(preds))]
filename = partial_filename
preds = []
i = 1
while osp.exists(osp.realpath(filename)):
sub_preds = mmengine.load(filename)
preds.extend(
[sub_preds[str(i)] for i in range(len(sub_preds))])
filename = root + f'_{i}' + ext
i += 1
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction', None)
return pred_dicts, pred_strs
def _process_predictions(self, pred_strs):
"""Apply various processing steps to predictions."""
# Check if we're dealing with a list of lists (pred_list_flag)
pred_list_flag = pred_strs is not None and isinstance(
pred_strs[0], list)
# Extract role predictions if needed
if ('pred_role' in self.eval_cfg and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
# Create a prompt template for role config parsing
from opencompass.models.base import LMTemplateParser
parser = LMTemplateParser(self.model_cfg['meta_template'])
role = parser.roles[self.eval_cfg['pred_role']]
if pred_list_flag:
pred_strs = [[
extract_role_pred(
_pred,
role.get('begin', None),
role.get('end', None),
) for _pred in pred
] for pred in pred_strs]
else:
filename = partial_filename
preds = []
i = 1
while osp.exists(osp.realpath(filename)):
sub_preds = mmengine.load(filename)
preds.extend(
[sub_preds[str(i)] for i in range(len(sub_preds))])
filename = root + f'_{i}' + ext
i += 1
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction', None)
pred_list_flag = pred_strs is not None and isinstance(
pred_strs[0], list)
if ('pred_role' in self.eval_cfg
and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
# Create a prompt template for role config parsing
from opencompass.models.base import LMTemplateParser
parser = LMTemplateParser(self.model_cfg['meta_template'])
role = parser.roles[self.eval_cfg['pred_role']]
if sc_size is not None:
assert pred_list_flag, (
'The prediction for Self-Consistency '
'must be list.')
if pred_list_flag:
pred_strs = [[
extract_role_pred(
_pred,
role.get('begin', None),
role.get('end', None),
) for _pred in pred
] for pred in pred_strs]
else:
pred_strs = [
extract_role_pred(
pred,
role.get('begin', None),
role.get('end', None),
) for pred in pred_strs
]
# Postprocess predictions if necessary
# Model Specified Postprocessor
if 'pred_postprocessor' in self.model_cfg:
kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
# Dataset Specified Postprocessor
if 'pred_postprocessor' in self.eval_cfg:
kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
model_pred_strs = []
if 'model_postprocessor' in self.eval_cfg:
references = (test_set[self.output_column]
if self.output_column else None)
model_pred_dicts = copy.deepcopy(pred_dicts)
for i, pred_dict in enumerate(model_pred_dicts):
pred_dict['reference'] = [references[i]]
self.logger.info('Postprocessing model predictions...')
kwargs = self.eval_cfg['model_postprocessor']
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
model_pred_strs = [[
proc(model_pred_dict, **kwargs)
for model_pred_dict in model_pred_dicts
]]
else:
model_pred_strs = proc(model_pred_dicts, **kwargs)
# Get majority voting predictions if use self-consistency
if sc_size is not None:
pred_strs = [
Counter(s).most_common(1)[0][0] for s in pred_strs
]
icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
# need results dir to save other files
out_path = get_infer_output_path(
self.model_cfg,
self.dataset_cfg,
osp.join(self.work_dir, 'results'),
)
icl_evaluator._out_dir = osp.splitext(out_path)[
0] # strip extension
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
preds['test_set'] = test_set
if 'origin_prompt' not in preds:
try:
preds['origin_prompt'] = [
None for _ in range(len(pred_strs))
]
except TypeError:
preds['origin_prompt'] = None
preds = {
k: preds[k]
for k in signature(icl_evaluator.score).parameters
}
k = self.dataset_cfg.get('k', 1)
n = self.dataset_cfg.get('n', 1)
result = icl_evaluator.evaluate(k, n, copy.deepcopy(test_set),
**preds)
# Get model postprocess result
model_details = None
model_result = None
if 'model_postprocessor' in self.eval_cfg:
model_preds = copy.deepcopy(preds)
model_preds['predictions'] = model_pred_strs
model_result = icl_evaluator.evaluate(k, n,
copy.deepcopy(test_set),
**model_preds)
for key in model_result:
if key == 'details':
model_details = model_result[key]
continue
new_key = 'model_postprocess_' + key
result[new_key] = model_result[key]
if self.dump_details:
details = result.get('details', None)
            # Try to format details if they are not provided by the evaluator
            if details is None:
                self.logger.info(
                    'Details not given by evaluator, trying to format them')
try:
result['details'] = self.format_details(
pred_strs,
model_pred_strs,
test_set[self.output_column],
details,
model_details,
pred_dicts,
)
                self.logger.warning(
                    f"result['details']: {result['details']}")
result['type'] = result['details'].pop('type', None)
if self.cal_extract_rate:
# Calculate the extraction success
# rate for prediction
result['extract_rate'] = self.extract_rate(result)
if 'PPL' in str(
self.dataset_cfg.infer_cfg.inferencer.type):
result['correct_bpb'], result['incorrect_bpb'] = (
self.calculate_bpb(pred_dicts))
except Exception as e:
self.logger.warning(
f'Skip dumping details due to: {e}.')
# Apply postprocessors if configured
# Postprocess predictions if necessary
# Model Specified Postprocessor
if 'pred_postprocessor' in self.model_cfg:
kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
# Dataset Specified Postprocessor
if 'pred_postprocessor' in self.eval_cfg:
kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
return pred_strs
def _evaluate_predictions(
self,
pred_strs,
test_set,
pred_dicts,
):
"""Evaluate predictions using the configured evaluator."""
# Get references from test set
references = (None if self.output_column is None else
[sample[self.output_column] for sample in test_set])
# Build evaluator from config
evaluator_cfg = self.eval_cfg.get('evaluator', {})
evaluator_type = evaluator_cfg.get('type')
if isinstance(evaluator_type, str):
evaluator_type = ICL_EVALUATORS.get(evaluator_type)
# Prepare evaluator inputs
evaluator_cfg_copy = copy.deepcopy(evaluator_cfg)
evaluator_cfg_copy.pop('type', None)
# Initialize evaluator with appropriate parameters
sig = signature(evaluator_type)
if 'predictions' in sig.parameters and 'references' in sig.parameters:
evaluator = evaluator_type(
predictions=pred_strs,
references=references,
**evaluator_cfg_copy,
)
else:
evaluator = evaluator_type(**evaluator_cfg_copy)
# Save result
# Set output directory for the evaluator
out_path = get_infer_output_path(
self.model_cfg,
self.dataset_cfg,
osp.join(self.work_dir, 'results'),
)
evaluator._out_dir = osp.splitext(out_path)[0] # strip extension
# If preds contains keys that match the score method
# parameters, include them
if pred_dicts:
preds = {
k: [pred.get(k) for pred in pred_dicts]
for k in pred_dicts[0]
}
# Add predictions and references if they're expected
# by the score method
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
preds['test_set'] = test_set
if 'origin_prompt' not in preds:
try:
preds['origin_prompt'] = [None for _ in range(len(pred_strs))]
except TypeError:
preds['origin_prompt'] = None
preds = {k: preds[k] for k in signature(evaluator.score).parameters}
# Call evaluate with the appropriate parameters
k = self.dataset_cfg.get('k', 1)
n = self.dataset_cfg.get('n', 1)
result = evaluator.evaluate(k, n, copy.deepcopy(test_set), **preds)
# Format details if needed
if self.dump_details:
# Get detailed results if available
details = result.get('details', None)
if details is None:
                self.logger.info(
                    'Details not given by evaluator, trying to format them')
try:
result['details'] = self.format_details(
pred_strs,
references,
details,
pred_dicts,
)
# Calculate extraction rate if needed
if self.cal_extract_rate and details is not None:
result['extract_rate'] = self.extract_rate(result)
# Calculate BPB if applicable
if pred_dicts and 'BPB' in pred_dicts[0].get(
list(pred_dicts[0].keys())[0], {}):
correct_bpb, incorrect_bpb = self.calculate_bpb(
pred_dicts)
result['correct_bpb'] = correct_bpb
result['incorrect_bpb'] = incorrect_bpb
except Exception as e:
self.logger.warning(f'Skip dumping details due to: {e}.')
else:
result.pop('details', None)
return result
def _save_results(self, result):
"""Save evaluation results to file."""
out_path = get_infer_output_path(
self.model_cfg,
self.dataset_cfg,
@ -351,10 +344,8 @@ class OpenICLEvalTask(BaseTask):
def format_details(
self,
predictions,
model_pred_strs,
references,
details,
model_details,
pred_dicts,
):
"""This function is responsible for formatting prediction details.
@ -393,20 +384,6 @@ class OpenICLEvalTask(BaseTask):
result['predictions'] = str(predictions[i])
result['references'] = str(references[i])
result['correct'] = str(predictions[i]) == str(references[i])
elif details is not None and model_details is not None:
assert (
model_pred_strs != []
), 'Model details is not None, but model_pred_strs is empty'
self.logger.info(
f"model_details[i]['pred']: {model_details[i]['pred']}")
results['type'] = 'GEN'
result['prompt'] = origin_prediction['origin_prompt']
result['origin_prediction'] = pred_dicts[i]['prediction']
result['predictions'] = details[i]['pred']
result['model_extract_predictions'] = model_details[i]['pred']
result['references'] = details[i]['answer']
result['correct'] = details[i]['correct']
result['model_extract_correct'] = model_details[i]['correct']
elif details is not None:
results['type'] = 'GEN'
result['prompt'] = origin_prediction['origin_prompt']

View File

@ -10,9 +10,7 @@ from .fileio import * # noqa
from .lark import * # noqa
from .logging import * # noqa
from .menu import * # noqa
from .model_postprocessors import * # noqa
from .network import * # noqa
from .postprocessors import * # noqa
from .prompt import * # noqa
from .result_station import * # noqa
from .text_postprocessors import * # noqa

View File

@ -1,135 +0,0 @@
from functools import partial
from multiprocessing import Pool
from typing import Union
from tqdm import tqdm
from opencompass.registry import TEXT_POSTPROCESSORS
from .postprocessors.naive import NaiveExtractor, format_input_naive
from .postprocessors.xfinder.extractor import Extractor
from .postprocessors.xfinder.xfinder_utils import (DataProcessor,
convert_to_xfinder_format)
def gen_output_naive(ori_data, extractor):
extracted_answers = []
for item in tqdm(ori_data):
user_input = extractor.prepare_input(item)
extracted_answer = extractor.gen_output(user_input)
item['extracted_answer'] = extracted_answer
extracted_answers.append(extracted_answer)
return extracted_answers
@TEXT_POSTPROCESSORS.register_module('naive')
def naive_model_postprocess(preds: list,
model_name: str,
custom_instruction: str,
api_url: Union[str, list],
num_processes: int = 8,
**kwargs) -> list:
"""Postprocess the text extracted by custom model.
Args:
preds (list): The question, reference answer and model prediction.
model_name (str): The name of the model.
custom_instruction (str): Custom instruction for the dataset.
        api_url (Union[str, list]): The api url of the model.
Returns:
list: The postprocessed answers.
"""
def _eval_pred(texts, extractor, num_processes):
ori_data = texts
extracted_answers = []
batched_ori_data = []
# Split data into batches
num_processes = min(num_processes, len(ori_data))
batch_size = len(ori_data) // num_processes
for i in range(0, len(ori_data), batch_size):
batched_ori_data.append(ori_data[i:i + batch_size])
with Pool(num_processes) as p:
results = p.map(partial(gen_output_naive, extractor=extractor),
batched_ori_data)
for result in results:
extracted_answers.extend(result)
return extracted_answers
format_data = format_input_naive(preds)
assert api_url is not None, 'Please provide the api url.'
extractor = NaiveExtractor(
model_name=model_name,
custom_instruction=custom_instruction,
url=api_url.split(',') if ',' in api_url else api_url)
calc_acc_func = partial(_eval_pred,
extractor=extractor,
num_processes=num_processes)
extracted_answers = calc_acc_func(format_data)
return extracted_answers
def gen_output_xfinder(ori_data, extractor):
ext_cor_pairs = []
extracted_data = []
extracted_answers = []
for item in tqdm(ori_data):
user_input = extractor.prepare_input(item)
extracted_answer = extractor.gen_output(user_input)
ext_cor_pairs.append([
item['key_answer_type'], item['standard_answer_range'],
extracted_answer, item['correct_answer']
])
item['xfinder_extracted_answer'] = extracted_answer
extracted_answers.append(extracted_answer)
extracted_data.append(item)
return extracted_answers, ext_cor_pairs, extracted_data
@TEXT_POSTPROCESSORS.register_module('xfinder')
def xfinder_postprocess(preds: list, question_type: str, model_name: str,
api_url: Union[str, list], **kwargs) -> list:
"""Postprocess the text extracted by xFinder model.
Args:
preds (list): The question, reference answer and model prediction.
question_type (str): The type of the question.
        api_url (Union[str, list]): The api url of the xFinder model.
Returns:
list: The postprocessed texts.
"""
def _eval_pred(texts, data_processor, extractor, num_processes=8):
ori_data = data_processor.read_data(texts)
extracted_correct_pairs = []
extracted_data = []
extracted_answers = []
batched_ori_data = []
# Split data into batches
num_processes = min(num_processes, len(ori_data))
batch_size = len(ori_data) // num_processes
for i in range(0, len(ori_data), batch_size):
batched_ori_data.append(ori_data[i:i + batch_size])
with Pool(num_processes) as p:
results = p.map(partial(gen_output_xfinder, extractor=extractor),
batched_ori_data)
for result in results:
extracted_answers += result[0]
extracted_correct_pairs += result[1]
extracted_data += result[2]
return extracted_answers
format_data = convert_to_xfinder_format(question_type, preds)
assert api_url is not None, 'Please provide the api url.'
data_processor = DataProcessor()
extractor = Extractor(
model_name=model_name,
url=api_url.split(',') if ',' in api_url else api_url)
calc_acc_func = partial(_eval_pred,
data_processor=data_processor,
extractor=extractor)
extracted_answers = calc_acc_func(format_data)
return extracted_answers

View File

@ -1,11 +0,0 @@
OPTION_NAVIE_PROMPT_TEMPLATE = """
There is a detailed explanation of the final answer you should extract:
1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences.
2. The question is a single choice question, so the final answer option should be one of the options, not a combination of options.
""" # noqa
MATH_NAVIE_PROMPT_TEMPLATE = """
This is a detailed explanation of the final answer you should extract:
1. The question type is a math question, so the final answer should be a number, set, vector, matrix, interval, expression, function, equation, or inequality and any combination of them.
2. If the final answer includes additional symbols, such as units, you should exclude them and only extract the pure final answer.
""" # noqa

View File

@ -1,71 +0,0 @@
## Short Usage Introduction for Naive Model Postprocessor with Custom Model
### Step 1: Deploy an API server using vLLM or LMDeploy
```bash
lmdeploy serve api_server meta-llama/Meta-Llama-3-8B-Instruct --model-name llama3-8b-instruct --server-port 23333 --backend turbomind --tp 1
```
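Before moving on to the config, it can help to confirm that the server actually answers requests. The snippet below is a minimal sanity-check sketch, assuming the server started above is listening on port 23333; it is not part of the original guide.
```python
# Minimal sanity check for the server deployed above (assumes port 23333).
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://0.0.0.0:23333/v1')
model_id = client.models.list().data[0].id  # name of the served model
reply = client.chat.completions.create(
    model=model_id,
    messages=[{'role': 'user', 'content': 'What is 3 + 5?'}],
    max_tokens=32,
)
print(reply.choices[0].message.content)
```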
### Step 2: Add Naive Model Postprocessor to the configuration file
Taking GSM8K as an example, add the following lines to the configuration file and replace `api_url` with the actual address of your API server.
```python
...
from opencompass.utils.model_postprocessors import navie_model_postprocess
from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE
...
gsm8k_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
# Add the following line to use the naive model postprocessor
model_postprocessor=dict(
type=navie_model_postprocess,
custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE,
model_name='llama3-8b-instruct',
api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
)
...
```
The extraction prompt can also be customized through the `custom_instruction` parameter. Two default templates are provided: `MATH_NAVIE_PROMPT_TEMPLATE` for math problems such as GSM8K and MATH, and `OPTION_NAVIE_PROMPT_TEMPLATE` for option problems such as MMLU. You can also write your own prompt template, for example:
```python
OPTION_NAVIE_PROMPT_TEMPLATE = """
There is a detailed explanation of the final answer you should extract:
1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences.
2. The question is a single choice question, so the final answer option should be one of the options, not a combination of options.
"""
```
Your prompt should start with `There is a detailed explanation of the final answer you should extract:` and be followed by your customized instructions.
### Step 3: Run the Evaluation as Usual
Now you can run the evaluation as usual with the modified configuration file. The evaluation will use the custom model as the postprocess model, and the final result will appear as `model_postprocess_accuracy` in the evaluation output, like:
```Markdown
dataset version metric mode llama-3-8b-instruct-turbomind
------------------------------------------------- --------- -------------------------- ------ -------------------------------
gsm8k a58960 accuracy gen 73.46
gsm8k a58960 model_postprocess_accuracy gen 78.77
```
## Experiment Results
We have tested the model postprocess method with different models (Qwen2-72B-Chat, Llama3-8b-Chat) as the postprocess model on the GSM8K and MMLU datasets for `Meta-Llama-3-8B-Instruct` with the above settings, and the results are as follows:
```Markdown
| Dataset | Type | Config ID | Regex Postprocess Score | Model Postprocess Score (Llama3-8b-Instruct) | Model Postprocess Score (Qwen2-72B-Chat) |
| ------- | --------------- | ------------------------ | ----------------------- | ----------------------- |----------------------- |
| gsm8k | math | a58960 | 73.46 | 79.08 | 78.77 |
| mmlu | option | 4d595a | 67.89 | 65.26 | 67.94 |
```
The `Model Postprocess Score` columns correspond to the `model_postprocess_accuracy` metric, i.e. the result after the `Naive Model Postprocessor` is applied.

View File

@ -1,2 +0,0 @@
from .extractor import * # noqa
from .PROMPT_TEMPLATE import * # noqa

View File

@ -1,121 +0,0 @@
# Naive model extractor for OpenCompass, modified from xFinder: https://github.com/IAAR-Shanghai/xFinder # noqa
import json
import time
from logging import getLogger
from openai import OpenAI
Meta_Instruction = """I will provide you with a question, output sentences along with an answer range. The output sentences are the response of the question provided. The answer range could either describe the type of answer expected or list all possible valid answers. Using the information provided, you must accurately and precisely determine and extract the intended key answer from the output sentences. Please don't have your subjective thoughts about the question.
First, you need to determine whether the content of the output sentences is relevant to the given question. If the entire output sentences are unrelated to the question (meaning the output sentences are not addressing the question), then output [No valid answer].
Otherwise, ignore the parts of the output sentences that have no relevance to the question and then extract the key answer that matches the answer range.
Below are some special cases you need to be aware of:
(1) If the output sentences present multiple different answers, carefully determine if the later provided answer is a correction or modification of a previous one. If so, extract this corrected or modified answer as the final response. Conversely, if the output sentences fluctuate between multiple answers without a clear final answer, you should output [No valid answer].
(2) If the answer range is a list and the key answer in the output sentences is not explicitly listed among the candidate options in the answer range, also output [No valid answer].
(3) You should only return the precise answer you extract, without processing the answer. Please return only the answer and do not add any additional content.
""" # noqa
def format_input_naive(data):
format_data = []
for item in data:
template = {}
question = item['origin_prompt'][-1]['prompt']
llm_output = item['prediction']
correct_answer = item['reference'] if item['reference'] else item[
'gold']
template['correct_answer'] = correct_answer
template['question'] = question
template['llm_output'] = llm_output
format_data.append(template)
return format_data
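# Illustrative sketch (assumption, not part of the original file): for a
# prediction record such as
#     {'origin_prompt': [{'role': 'HUMAN', 'prompt': 'What is 2 + 2?'}],
#      'prediction': 'The answer is 4.', 'reference': '4'}
# format_input_naive returns
#     [{'correct_answer': '4', 'question': 'What is 2 + 2?',
#       'llm_output': 'The answer is 4.'}]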
class NaiveExtractor:
def __init__(
self,
model_name,
model_path=None,
url=None,
temperature=0,
max_tokens=3000,
api_key='EMPTY',
SYSTEM='You are a help assistant tasked with extracting the precise key answer from given output sentences. You must only provide the extracted key answer without including any additional text.', # noqa
custom_instruction=''):
self.model_name = model_name
self.SYSTEM = SYSTEM
self.model_path = model_path
self.url = url
self.api_key = api_key
self.temperature = temperature
self.max_tokens = max_tokens
self.custom_instruction = custom_instruction
self.logger = getLogger(__name__)
def prepare_input(self, item):
user_input = Meta_Instruction + self.custom_instruction + \
"Question: \"\"\"" + item['question'] + "\"\"\"\n\n" + \
"Output sentences: \"\"\"" + item['llm_output'] + "\"\"\"\n\n" + \
'Key extracted answer: '
return user_input
def gen_output(self, query):
return self.openai_infer(query)
def openai_infer(self, query: str, retry=9) -> str:
"""Perform inference on the OpenAI model.
Args:
query (str): The input query.
Returns:
str: The extracted answer (xFinder's output).
"""
if isinstance(self.url, list):
            # Randomly choose an api url for better load balancing
import random
self.url = random.choice(self.url)
self.client = OpenAI(
api_key=self.api_key,
base_url=self.url,
)
self.retry = retry
t = time.time()
retry = self.retry
response = ''
while retry > 0:
try:
chat_response = self.client.chat.completions.create(
model=self.client.models.list().data[0].id
if self.model_name == '' else self.model_name,
messages=[
{
'role': 'system',
'content': self.SYSTEM
},
{
'role': 'user',
'content': query
},
],
temperature=self.temperature,
max_tokens=self.max_tokens,
)
js_response = json.loads(chat_response.model_dump_json())
response = js_response['choices'][0]['message']['content']
break
except Exception as e:
self.logger.info(f'Error: {e}')
self.logger.info(f'{self.url} is down. Retrying...')
self.logger.info(f'Time elapsed: {time.time() - t} seconds')
time.sleep(6)
retry -= 1
if retry == 0:
response = 'Error: Failed to get response.'
self.logger.info(f'{response} after {self.retry} tries.')
raise ValueError('The api is down')
return response.strip()
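# Illustrative usage sketch (assumption, not part of the original file):
#     extractor = NaiveExtractor(model_name='llama3-8b-instruct',
#                                url='http://0.0.0.0:23333/v1')
#     item = {'question': '1 + 1 = ?', 'llm_output': 'The answer is 2.'}
#     answer = extractor.gen_output(extractor.prepare_input(item))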

View File

@ -1,194 +0,0 @@
## Extract Final Answers with Postprocess Models
OpenCompass now supports postprocessing (extracting) prediction answers with postprocess models, to better reflect the true ability of models. Currently, we use [xFinder](https://github.com/IAAR-Shanghai/xFinder) as our first postprocess model to extract the final answers from model outputs.
The following task types are supported now:
1. **math**: for math questions with numerical or formula answers, like GSM8K, MATH, etc.
2. **alphabet_option**: for multiple-choice questions with alphabet answers, like CommonsenseQA, MMLU, etc.
3. **short_text**: for questions whose answer is a short text selected from a set of candidate short texts.
Here are some examples of the supported question types from the official xFinder repo:
```json
[
{
"model_name": "Llama3_8B_Instruct",
"dataset": "MetaMathQA",
"key_answer_type": "math",
"question": "Find the smallest, positive four-digit multiple of $15.$",
"llm_output": "The smallest four-digit multiple of $15$ is $\\boxed{1005}.$",
"correct_answer": "1005",
"standard_answer_range": "a(n) number / set / vector / matrix / interval / expression / function / equation / inequality"
},
{
"model_name": "LLaMA2_7B_Chat",
"dataset": "SIQA_text",
"key_answer_type": "short_text",
"question": "Jordan kept Carson up all night talking because Jordan's girlfriend left them and they were quite upset. How would Carson feel as a result? Answer Choices: drowsy / upset their girlfriend left / energetic",
"llm_output": "Based on the information provided in the question, Carson would likely feel upset because Jordan's girlfriend left them and they were quite upset. Therefore, the correct answer is (B) upset their girlfriend left.",
"correct_answer": "drowsy",
"standard_answer_range": [
"drowsy",
"upset their girlfriend left",
"energetic"
]
},
{
"model_name": "PHI2",
"dataset": "CommonsenseQA_alpha",
"key_answer_type": "alphabet_option",
"question": "Why would you go to an office? Answer Choices: (A) work (B) school building (C) paper (D) city (E) habit",
"llm_output": "A) work (B) school building (C) paper (D) city (E) habit\n\nExplanation:\n We are looking for a reason why someone would go to an office. The only answer choice that matches this is option A) work. Therefore, the correct answer is A) work.",
"correct_answer": "A",
"standard_answer_range": [
[
"A",
"work"
],
[
"B",
"school building"
],
[
"C",
"paper"
],
[
"D",
"city"
],
[
"E",
"habit"
]
]
}
]
```
## How to Use Model Postprocess in OpenCompass
### Step 1: Deploy the Postprocess Model Server
For now, there are two xFinder models you can use; you can download them from the Hugging Face model hub:
1. **IAAR-Shanghai/xFinder-qwen1505**
2. **IAAR-Shanghai/xFinder-llama38it**
You can use LMDeploy or vLLM to deploy the xFinder model server. For example, the following command deploys it with LMDeploy:
```bash
lmdeploy serve api_server IAAR-Shanghai/xFinder-qwen1505 --model-name xFinder-qwen1505 --server-port 23333 --backend turbomind --tp 1
```
### Step 2: Set the Postprocess Model Config in the Dataset Configuration
The model postprocess is implemented as a common postprocess function in OpenCompass and can be used together with the default regex-based extraction at the same time. All you need to do is deploy the postprocess model server and add a `model_postprocessor` entry to the dataset's original `eval_cfg`, as in the following example:
```python
from opencompass.utils.model_postprocessors import xfinder_postprocess
...
model_postprocessor=dict(
type=xfinder_postprocess,
question_type='math',
xfinder_model_name='xFinder-qwen1505',
xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
```
Explanation of the parameters:
- `question_type`: the type of the question, which can be one of the three types mentioned above.
- `xfinder_model_name`: the name you gave the model when deploying the model server.
- `xfiner_api_url`: the URL of the model server; you can set multiple URLs separated by `,` to use several model servers in parallel, which can speed up postprocessing (see the direct-call sketch after this list).
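The same function can also be called directly on a list of OpenCompass prediction records, which can be handy when debugging a deployed server. This is only a sketch based on the function signature shown in this PR; `predictions` and the URL are placeholders, not values from the original document.
```python
# Hypothetical direct call for debugging; `predictions` is a placeholder list of
# OpenCompass prediction dicts (with origin_prompt / prediction / gold fields).
from opencompass.utils.model_postprocessors import xfinder_postprocess

extracted = xfinder_postprocess(
    preds=predictions,
    question_type='alphabet_option',
    model_name='xFinder-qwen1505',
    api_url='http://0.0.0.0:23333/v1',
)
print(extracted[:3])  # first few extracted answers
```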
📢 **Please pay attention to the following points**:
1. Currently, only the zero-shot setting is supported for answer extraction.
2. For alphabet_option problems, the options should follow a format like '\\nA. xxx\\nB. xxx\\nC. xxx\\nD. xxx\\nE. xxx\\n ...' or '\\n(A) xxx\\n(B) xxx\\n(C) xxx\\n(D) xxx\\n(E) xxx\\n ...', and the correct answer should be the letter of the correct option, such as 'A', 'B', 'C', 'D', or 'E'.
For more details about the xFinder model, refer to [xFinder](https://github.com/IAAR-Shanghai/xFinder). For a complete example, see the following GSM8K dataset configuration with the xFinder postprocess model:
```python
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_dataset_postprocess, Gsm8kEvaluator
from opencompass.datasets import MATHEvaluator, math_postprocess_v2
from opencompass.utils.model_postprocessors import xfinder_postprocess
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
gsm8k_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
model_postprocessor=dict(
type=xfinder_postprocess,
question_type='math',
xfinder_model_name='xFinder-qwen1505',
xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
)
gsm8k_datasets = [
dict(
abbr='gsm8k',
type=GSM8KDataset,
path='opencompass/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg,
)
]
```
In the evaluation results, `accuracy` is the score with the default postprocess and `model_postprocess_accuracy` is the score with the xFinder postprocess; the gap can widen when the model does not answer questions in the expected format.
You can also pass the `--dump-eval-details` flag to dump detailed evaluation results and inspect the model postprocess outputs in the `results` folder.
## Results Comparison with Different Question Types
We have tested the model postprocess method with the xFinder model on the GSM8K, MMLU, and Natural Questions (NQ) datasets for `Meta-Llama-3-8B-Instruct` with the above settings, and the results are as follows:
| Dataset | Type | Config Name | Regex Postprocess Score | Model Postprocess Score |
| ------- | --------------- | ------------------------ | ----------------------- | ----------------------- |
| gsm8k | math | gsm8k_xfinder_gen_a58960 | 73.46 | 78.09 |
| nq | short_text | nq_xfinder_gen_3dcea1 | 22.33 | 37.53 |
| mmlu | alphabet_option | mmlu_xfinder_gen_4d595a | 67.89 | 67.93 |
## Citation
```bibtex
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
@misc{yu2024xfinderrobustpinpointanswer,
title={xFinder: Robust and Pinpoint Answer Extraction for Large Language Models},
author={Qingchen Yu and Zifan Zheng and Shichao Song and Zhiyu Li and Feiyu Xiong and Bo Tang and Ding Chen},
year={2024},
eprint={2405.11874},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.11874},
}
```

View File

@ -1,175 +0,0 @@
import json
import time
from logging import getLogger
import requests
from openai import OpenAI
from .xfinder_utils import PROMPT_TEMPLATE
Instruction = """I will provide you with a question, output sentences along with an answer range. The output sentences are the response of the question provided. The answer range could either describe the type of answer expected or list all possible valid answers. Using the information provided, you must accurately and precisely determine and extract the intended key answer from the output sentences. Please don't have your subjective thoughts about the question.
First, you need to determine whether the content of the output sentences is relevant to the given question. If the entire output sentences are unrelated to the question (meaning the output sentences are not addressing the question), then output [No valid answer].
Otherwise, ignore the parts of the output sentences that have no relevance to the question and then extract the key answer that matches the answer range.
Below are some special cases you need to be aware of:
(1) If the output sentences present multiple different answers, carefully determine if the later provided answer is a correction or modification of a previous one. If so, extract this corrected or modified answer as the final response. Conversely, if the output sentences fluctuate between multiple answers without a clear final answer, you should output [No valid answer].
(2) If the answer range is a list and the key answer in the output sentences is not explicitly listed among the candidate options in the answer range, also output [No valid answer].
""" # noqa
class Extractor:
def __init__(
self,
model_name,
model_path=None,
url=None,
temperature=0,
max_tokens=3000,
api_key='EMPTY',
SYSTEM='You are a help assistant tasked with extracting the precise key answer from given output sentences. You must only provide the extracted key answer without including any additional text.' # noqa
):
self.model_name = model_name
self.PROMPT_TEMPLATE = PROMPT_TEMPLATE[model_name]
self.SYSTEM = SYSTEM
self.model_path = model_path
self.url = url
self.api_key = api_key
self.temperature = temperature
self.max_tokens = max_tokens
self.mode = 'API' if self.url is not None else 'Local'
self.logger = getLogger(__name__)
if self.mode == 'Local':
from vllm import LLM, SamplingParams
self.sampling_params = SamplingParams(temperature=self.temperature,
max_tokens=self.max_tokens,
stop=[
'<|endoftext|>',
'<|im_end|>', '<eoa>',
'<||>', '<end_of_turn>',
'<|eot_id|>'
])
self.llm = LLM(model=self.model_path, gpu_memory_utilization=0.5)
@staticmethod
def prepare_input(item):
user_input = Instruction + \
"Question: \"\"\"" + item['question'] + "\"\"\"\n\n" + \
"Output sentences: \"\"\"" + item['llm_output'] + "\"\"\"\n\n" + \
'Answer range: ' + item['standard_answer_range'] + '\n\n' + \
'Key extracted answer: '
return user_input
def gen_output(self, query):
if self.mode == 'API':
# return self.send_request(query)
return self.openai_infer(query)
else:
return self.offline_infer(query)
def send_request(self, query: str) -> str:
"""Send a request to the model's API and return the response.
Args:
query (str): The input query.
Returns:
str: The extracted answer (xFinder's output).
"""
prompt = self.PROMPT_TEMPLATE.format(system=self.SYSTEM, input=query)
payload = json.dumps({
'prompt':
prompt,
'temperature':
self.temperature,
'max_tokens':
self.max_tokens,
'stop': [
'<|endoftext|>', '<|im_end|>', '<eoa>', '<||>',
'<end_of_turn>', '<|eot_id|>'
],
})
headers = {'Content-Type': 'application/json'}
res = requests.request('POST', self.url, headers=headers, data=payload)
res = res.json()['text'][0]
res = res.replace(prompt, '')
# res = requests.post(self.url, json=payload)
# res = res.json()['text']
res = res.strip()
return res
def openai_infer(self, query: str, retry=9) -> str:
"""Perform inference on the OpenAI model.
Args:
query (str): The input query.
Returns:
str: The extracted answer (xFinder's output).
"""
if isinstance(self.url, list):
            # Randomly choose an api url for better load balancing
import random
self.url = random.choice(self.url)
self.client = OpenAI(
api_key=self.api_key,
base_url=self.url,
)
self.retry = retry
t = time.time()
retry = self.retry
response = ''
while retry > 0:
try:
chat_response = self.client.chat.completions.create(
model=self.client.models.list().data[0].id
if self.model_name == '' else self.model_name,
messages=[
{
'role': 'system',
'content': self.SYSTEM
},
{
'role': 'user',
'content': query
},
],
stop=[
'<|endoftext|>', '<|im_end|>', '<eoa>', '<||>',
'<end_of_turn>', '<|eot_id|>'
],
temperature=self.temperature,
max_tokens=self.max_tokens,
)
js_response = json.loads(chat_response.model_dump_json())
response = js_response['choices'][0]['message']['content']
break
except Exception as e:
self.logger.info(f'Error: {e}')
self.logger.info(f'{self.url} is down. Retrying...')
self.logger.info(f'Time elapsed: {time.time() - t} seconds')
time.sleep(6)
retry -= 1
if retry == 0:
response = 'Error: Failed to get response.'
self.logger.info(f'{response} after {self.retry} tries.')
raise ValueError('The api is down')
return response.strip()
def offline_infer(self, query: str) -> str:
"""Perform inference on the local xFinder model.
Args:
query (str): The input query.
Returns:
str: The extracted answer (xFinder's output).
"""
prompt = self.PROMPT_TEMPLATE.format(system=self.SYSTEM, input=query)
res = self.llm.generate(prompt, self.sampling_params)
res = res[0]
res = res.outputs[0].text.strip()
return res

View File

@ -1,14 +0,0 @@
PROMPT_TEMPLATE = {
'xFinder-qwen1505':
"""<|System|>:{system}
<|User|>:{input}
<|Bot|>:""",
'xFinder-llama38it':
"""<|start_header_id|>system<|end_header_id|>
{system}<|eot_id|><|start_header_id|>user<|end_header_id|>
{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
""",
}

View File

@ -1,3 +0,0 @@
from .convert_data import * # noqa
from .data_process import * # noqa
from .PROMPT_TEMPLATE import * # noqa

View File

@ -1,123 +0,0 @@
# Convert OpenCompass prediction data to XFinder format
import copy
import json
import re
xfinder_template = {
'math': {
'model_name':
'',
'dataset':
'',
'key_answer_type':
'math',
'question':
'',
'llm_output':
'',
'correct_answer':
'',
'standard_answer_range':
'a(n) number / set / vector / matrix / interval / expression / function / equation / inequality' # noqa
},
'alphabet_option': {
'model_name': '',
'dataset': '',
'key_answer_type': 'alphabet_option',
'question': '',
'llm_output': '.',
'correct_answer': '',
'standard_answer_range': []
},
'categorical_label': {
'model_name': '',
'dataset': '',
'key_answer_type': '',
'question': '',
'llm_output': '',
'correct_answer': '',
'standard_answer_range': []
},
'short_text': {
'model_name': '',
'dataset': '',
'key_answer_type': 'short_text',
'question': '',
'llm_output': '',
'correct_answer': '',
'standard_answer_range': []
}
}
def parse_options(text: str):
lines = text.split('\n')
parsed_options = []
option_pattern = r'^[A-Z]\)|[A-Z]\.|[A-Z]\)|[A-Z]:|\([A-Z]\)'
for line in lines:
line = line.strip()
match = re.match(option_pattern, line)
if match:
option = ''
            # Take the first alphabetic character as the option letter
for c in line:
if c.isalpha():
option = c
break
content_start = match.end() + 1
content = line[content_start:].strip()
parsed_options.append([option, content])
return parsed_options
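# Illustrative example (assumption, not part of the original file): for the
# option block '(A) work\n(B) school building\n(C) paper', parse_options
# returns [['A', 'work'], ['B', 'school building'], ['C', 'paper']].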
def convert_to_xfinder_format(typ, data, model_name='', dataset_name=''):
assert typ in xfinder_template.keys(), f'Invalid type {typ}'
format_data = []
for item in data:
template = copy.deepcopy(xfinder_template[typ])
question = item['origin_prompt'][-1]['prompt']
llm_output = item['prediction']
correct_answer = item['reference'] if item['reference'] else item[
'gold']
template['correct_answer'] = correct_answer
template['model_name'] = model_name
template['dataset'] = dataset_name
template['question'] = question
template['llm_output'] = llm_output
try:
assert typ in list(xfinder_template.keys())
if typ == 'alphabet_option':
options = parse_options(question)
template['standard_answer_range'] = options
elif typ == 'short_text':
template['standard_answer_range'] = item['gold']
elif typ == 'categorical_label':
pass
except Exception as e:
print(f'Error when parsing question options: {e}, skipping...')
continue
format_data.append(template)
return format_data
if __name__ == '__main__':
# Test
example_data = {
'origin_prompt': [{
'role':
'HUMAN',
'prompt':
'Alice, Bob, Claire, Dave, and Eve are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Ophelia, Bob is dancing with Jamie, Claire is dancing with Melissa, Dave is dancing with Rodrigo, and Eve is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Claire and Bob switch partners. Then, Claire and Eve switch partners. Then, Claire and Bob switch partners. Then, Eve and Dave switch partners. Finally, Claire and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Ophelia\n(B) Jamie\n(C) Melissa\n(D) Rodrigo\n(E) Patrick' # noqa
}],
'origin_prediction':
'\n 答案: B) 前者小于后者',
'prediction':
'B',
'reference':
'A'
}
example_data = convert_to_xfinder_format('alphabet_option', [example_data],
'GPT-3', 'OpenAI')
print(json.dumps(example_data, indent=4, ensure_ascii=False))

View File

@ -1,24 +0,0 @@
import ast
class DataProcessor:
def __init__(self):
pass
def read_data(self, data):
for item in data:
if isinstance(item['standard_answer_range'],
str) and item['key_answer_type'] != 'math':
try:
item['standard_answer_range'] = ast.literal_eval(
item['standard_answer_range'])
except Exception as e:
print(f'Error: {e}')
print('Please check the form of standard_answer_range')
exit(0)
item['standard_answer_range'] = str(item['standard_answer_range'])
item['key_answer_type'] = str(item['key_answer_type'])
return data
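# Illustrative note (assumption, not part of the original file): an item whose
# standard_answer_range arrives as the string "[['A', 'work'], ['B', 'paper']]"
# is parsed with ast.literal_eval and then re-serialized via str(), so every
# item leaves read_data with string-typed standard_answer_range and
# key_answer_type fields.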

View File

@ -4,7 +4,7 @@ alpaca-eval==0.6
antlr4-python3-runtime==4.11
cn2an
# Dingo
dingo-python==1.1.2
dingo-python==1.5.0
# Icl topk retriever
faiss_gpu==1.7.2
# Humaneval, Humaneval X