Merge branch 'open-compass:main' into f14-baseinferencer

2025-05-30 16:03:24 +08:00 · 2025-05-21 13:00:32 +02:00 · 2025-05-21 13:00:32 +02:00 · b199be89d7
commit b199be89d7
parent 61b52844be aa2b89b6f8
151 changed files with 6990 additions and 550 deletions
--- a/.github/scripts/oc_score_baseline.yaml
+++ b/.github/scripts/oc_score_baseline.yaml
@ -9,7 +9,7 @@ internlm2_5-7b_hf:
    race-high_accuracy: 90.02

 internlm2_5-7b-chat-lmdeploy:
-    demo_gsm8k_accuracy: 87.50
+    demo_gsm8k_accuracy: 84.38
    race-middle_accuracy: 92.76
    race-high_accuracy: 90.54

@ -24,7 +24,7 @@ internlm3-8b-instruct_hf-lmdeploy:
    race-high_accuracy: 90.34

 internlm3-8b-instruct_hf-vllm:
-    demo_gsm8k_accuracy: 81.25
+    demo_gsm8k_accuracy: 78.12
    race-middle_accuracy: 92.20
    race-high_accuracy: 89.88

@ -34,6 +34,6 @@ internlm2_5-7b-chat_hf:
    race-high_accuracy: 90.48

 lmdeploy-api-test:
-    gsm8k_accuracy: 56.25
+    gsm8k_accuracy: 68.75
    race-middle_accuracy: 93.75
    race-high_accuracy: 93.75
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench:
        college_knowledge_naive_average: 87.5
    subjective:
        alignment_bench_v1_1_总分: 0.66
-        alpaca_eval_total: 0
-        arenahard_score: 50
+        alpaca_eval_total: 20.00
+        arenahard_score: 56.82
        Followbench_naive_average: 1
        CompassArena_naive_average: 43
-        mtbench101_avg: 7.8
-        wildbench_average: -15.56
-        simpleqa_accuracy_given_attempted: 0
-        chinese_simpleqa_given_attempted_accuracy: 1
-        alignment_bench_v1_1_专业能力: 8.00
+        mtbench101_avg: 7.60
+        wildbench_average: -14.58
+        simpleqa_accuracy_given_attempted: 1.00
+        chinese_simpleqa_given_attempted_accuracy: 0.90
+        alignment_bench_v1_1_专业能力: 7.90
        alignment_bench_v1_1_数学计算: 0
        alignment_bench_v1_1_基本任务: 0
        alignment_bench_v1_1_逻辑推理: 0
@ -55,11 +55,11 @@ internlm2_5-7b-chat-hf_fullbench:
        alignment_bench_v1_1_文本写作: 0
        alignment_bench_v1_1_角色扮演: 0
        alignment_bench_v1_1_综合问答: 0
-        alpaca_eval_helpful_base: 0
+        alpaca_eval_helpful_base: 20.00
        compassarena_language_naive_average: 35
-        compassarena_knowledge_naive_average: 55
+        compassarena_knowledge_naive_average: 60.00
        compassarena_reason_v2_naive_average: 40
-        compassarena_math_v2_naive_average: 55
+        compassarena_math_v2_naive_average: 50.00
        compassarena_creationv2_zh_naive_average: 30
        followbench_llmeval_en_HSR_AVG: 1
        followbench_llmeval_en_SSR_AVG: 1
@ -73,58 +73,58 @@ internlm2_5-7b-chat-hf_fullbench:
        followbench_llmeval_en_SSR_L3: 1
        followbench_llmeval_en_SSR_L4: 1
        followbench_llmeval_en_SSR_L5: 1
-        simpleqa_f1: 0
+        simpleqa_f1: 0.12

 internlm2_5-7b-chat-turbomind_fullbench:
    objective:
        race-high_accuracy:  93.75
-        ARC-c_accuracy: 87.50
-        BoolQ_accuracy: 68.75
+        ARC-c_accuracy: 93.75
+        BoolQ_accuracy: 75.00
        triviaqa_wiki_1shot_score: 50
        nq_open_1shot_score: 25
        IFEval_Prompt-level-strict-accuracy: 56.25
        drop_accuracy: 75
-        GPQA_diamond_accuracy: 31.25
-        hellaswag_accuracy: 87.5
+        GPQA_diamond_accuracy: 37.50
+        hellaswag_accuracy: 81.25
        TheoremQA_score: 12.5
        musr_average_naive_average: 39.58
        korbench_single_naive_average: 40
-        gsm8k_accuracy: 62.5
-        math_accuracy: 75
+        gsm8k_accuracy: 68.75
+        math_accuracy: 68.75
        cmo_fib_accuracy: 6.25
        aime2024_accuracy: 6.25
        wikibench-wiki-single_choice_cncircular_perf_4: 25
        sanitized_mbpp_score: 68.75
-        ds1000_naive_average: 17.86
+        ds1000_naive_average: 15.18
        lcb_code_generation_pass@1: 12.5
        lcb_code_execution_pass@1: 43.75
-        lcb_test_output_pass@1: 18.75
-        bbh-logical_deduction_seven_objects_score: 56.25
-        bbh-multistep_arithmetic_two_score: 75
-        mmlu-other_accuracy: 72.6
-        cmmlu-china-specific_accuracy: 78.33
-        mmlu_pro_math_accuracy: 31.25
-        ds1000_Pandas_accuracy: 12.5
+        lcb_test_output_pass@1: 0.00
+        bbh-logical_deduction_seven_objects_score: 62.50
+        bbh-multistep_arithmetic_two_score: 62.50
+        mmlu-other_accuracy: 73.08
+        cmmlu-china-specific_accuracy: 75.42
+        mmlu_pro_math_accuracy: 25.00
+        ds1000_Pandas_accuracy: 0.00
        ds1000_Numpy_accuracy: 0
        ds1000_Tensorflow_accuracy: 12.5
-        ds1000_Scipy_accuracy: 25
+        ds1000_Scipy_accuracy: 18.75
        ds1000_Sklearn_accuracy: 18.75
-        ds1000_Pytorch_accuracy: 6.25
-        ds1000_Matplotlib_accuracy: 50.00
+        ds1000_Pytorch_accuracy: 12.50
+        ds1000_Matplotlib_accuracy: 43.75
        openai_mmmlu_lite_AR-XY_accuracy: 37.5
        college_naive_average: 12.50
        college_knowledge_naive_average: 87.5
    subjective:
-        alignment_bench_v1_1_总分: 0.66
-        alpaca_eval_total: 0
-        arenahard_score: 50
+        alignment_bench_v1_1_总分: 0.72
+        alpaca_eval_total: 20.00
+        arenahard_score: 55.77
        Followbench_naive_average: 1
-        CompassArena_naive_average: 40
-        mtbench101_avg: 8
-        wildbench_average: -6.81
-        simpleqa_accuracy_given_attempted: 0
+        CompassArena_naive_average: 39.00
+        mtbench101_avg: 7.90
+        wildbench_average: 0.00
+        simpleqa_accuracy_given_attempted: 1.00
        chinese_simpleqa_given_attempted_accuracy: 1
-        alignment_bench_v1_1_专业能力: 7.9
+        alignment_bench_v1_1_专业能力: 8.70
        alignment_bench_v1_1_数学计算: 0
        alignment_bench_v1_1_基本任务: 0
        alignment_bench_v1_1_逻辑推理: 0
@ -132,12 +132,12 @@ internlm2_5-7b-chat-turbomind_fullbench:
        alignment_bench_v1_1_文本写作: 0
        alignment_bench_v1_1_角色扮演: 0
        alignment_bench_v1_1_综合问答: 0
-        alpaca_eval_helpful_base: 0
-        compassarena_language_naive_average: 35
-        compassarena_knowledge_naive_average: 45
-        compassarena_reason_v2_naive_average: 25
-        compassarena_math_v2_naive_average: 60
-        compassarena_creationv2_zh_naive_average: 35
+        alpaca_eval_helpful_base: 20.00
+        compassarena_language_naive_average: 25.00
+        compassarena_knowledge_naive_average: 55.00
+        compassarena_reason_v2_naive_average: 35.00
+        compassarena_math_v2_naive_average: 55.00
+        compassarena_creationv2_zh_naive_average: 25.00
        followbench_llmeval_en_HSR_AVG: 1
        followbench_llmeval_en_SSR_AVG: 1
        followbench_llmeval_en_HSR_L1: 1
@ -150,7 +150,7 @@ internlm2_5-7b-chat-turbomind_fullbench:
        followbench_llmeval_en_SSR_L3: 1
        followbench_llmeval_en_SSR_L4: 1
        followbench_llmeval_en_SSR_L5: 1
-        simpleqa_f1: 0
+        simpleqa_f1: 0.12

 internlm2_5-7b-hf_fullbench:
    objective:
@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
        drop_accuracy: 62.5
        GPQA_diamond_accuracy: 62.5
        hellaswag_accuracy: 93.75
-        TheoremQA_score: 12.50
+        TheoremQA_score: 18.75
        winogrande_accuracy: 75
        gsm8k_accuracy: 37.5
        GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
@ -188,23 +188,23 @@ internlm2_5-7b-turbomind_fullbench:
        triviaqa_wiki_1shot_score: 43.75
        nq_open_1shot_score: 43.75
        drop_accuracy: 62.5
-        GPQA_diamond_accuracy: 62.5
+        GPQA_diamond_accuracy: 68.75
        hellaswag_accuracy: 93.75
-        TheoremQA_score: 12.50
+        TheoremQA_score: 18.75
        winogrande_accuracy: 87.5
-        gsm8k_accuracy: 56.25
-        GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
+        gsm8k_accuracy: 62.50
+        GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75
        GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
-        math_accuracy: 18.75
-        wikibench-wiki-single_choice_cncircular_perf_4: 25
+        math_accuracy: 6.25
+        wikibench-wiki-single_choice_cncircular_perf_4: 0.00
        sanitized_mbpp_score: 62.50
-        dingo_en_192_score: 50.00
-        dingo_zh_170_score: 93.75
-        mmlu-other_accuracy: 76.92
-        cmmlu-china-specific_accuracy: 84.17
+        dingo_en_192_score: 37.50
+        dingo_zh_170_score: 100.00
+        mmlu-other_accuracy: 78.37
+        cmmlu-china-specific_accuracy: 83.33
        mmlu_pro_math_accuracy: 18.75
-        bbh-logical_deduction_seven_objects_score: 43.75
-        bbh-multistep_arithmetic_two_score: 56.25
+        bbh-logical_deduction_seven_objects_score: 62.50
+        bbh-multistep_arithmetic_two_score: 50.00
        college_naive_average: 12.5
        college_knowledge_naive_average: 87.5

@ -230,7 +230,7 @@ internlm2_5-7b-turbomind:
        mmlu_naive_average: 71.44
        mmlu_pro_naive_average: 38.18
        openai_humaneval_humaneval_pass@1: 59.76
-        openai_humaneval_v2_humaneval_pass@1: 51.22
+        openai_humaneval_v2_humaneval_pass@1: 57.93
        sanitized_mbpp_score: 55.25
        dingo_en_192_score: 60.94
        dingo_zh_170_score: 67.65
@ -257,17 +257,17 @@ internlm2_5-7b-turbomind:
        mmlu_pro_physics_accuracy: 26.02
        mmlu_pro_psychology_accuracy: 52.76
        mmlu_pro_other_accuracy: 42.21
-        college_naive_average: 10.67
+        college_naive_average: 7.00
        high_naive_average: 6.67
        middle_naive_average: 26.67
-        primary_naive_average: 60
+        primary_naive_average: 64.00
        arithmetic_naive_average: 55
        mathbench-a (average)_naive_average: 31.8
-        college_knowledge_naive_average: 62.34
-        high_knowledge_naive_average: 59.83
+        college_knowledge_naive_average: 58.23
+        high_knowledge_naive_average: 52.51
        middle_knowledge_naive_average: 71.15
-        primary_knowledge_naive_average: 66.55
-        mathbench-t (average)_naive_average: 64.97
+        primary_knowledge_naive_average: 60.48
+        mathbench-t (average)_naive_average: 60.19
    long_context:
        Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
        Single-Needle-Retrieval-EN-32000_naive_average: 100
@ -309,7 +309,7 @@ internlm2_5-7b-chat-turbomind:
        GaokaoBench_weighted_average: 78.6
        math_accuracy: 61
        cmo_fib_accuracy: 11
-        aime2024_accuracy: 6.67
+        aime2024_accuracy: 3.33
        Mathbench_naive_average: 64.23
        wikibench-wiki-single_choice_cncircular_perf_4: 31.32
        cmmlu_naive_average: 74.3
@ -322,7 +322,7 @@ internlm2_5-7b-chat-turbomind:
        lcb_code_generation_pass@1: 17.75
        lcb_code_execution_pass@1: 32.57
        lcb_test_output_pass@1: 26.13
-        bigcodebench_hard_instruct_pass@1: 8.45
+        bigcodebench_hard_instruct_pass@1: 3.38
        bigcodebench_hard_complete_pass@1: 5.06
        teval_naive_average: 80
        SciCode_sub_accuracy: 5.56
@ -384,7 +384,7 @@ internlm2_5-7b-chat-turbomind:
        college_knowledge_naive_average: 67.1
        high_knowledge_naive_average: 70
        middle_knowledge_naive_average: 80
-        primary_knowledge_naive_average: 87
+        primary_knowledge_naive_average: 90.12
        mathbench-t (average)_naive_average: 76
    subjective:
        alignment_bench_v1_1_总分: 5.68
@ -409,11 +409,11 @@ internlm2_5-7b-chat-turbomind:
        alpaca_eval_koala: 28.21
        alpaca_eval_oasst: 23.4
        alpaca_eval_selfinstruct: 30.95
-        alpaca_eval_vicuna: 33.75
-        compassarena_language_naive_average: 58.50
+        alpaca_eval_vicuna: 25.00
+        compassarena_language_naive_average: 53.00
        compassarena_knowledge_naive_average: 36
        compassarena_reason_v2_naive_average: 35
-        compassarena_math_v2_naive_average: 25.95
+        compassarena_math_v2_naive_average: 16.07
        compassarena_creationv2_zh_naive_average: 43.64
        fofo_test_prompts_overall: 0.35
        fofo_test_prompts_cn_overall: 0.41
@ -524,7 +524,7 @@ qwen2.5-7b-instruct-turbomind:
        humanevalx-python_pass@1: 50
        humanevalx-cpp_pass@1: 42.07
        humanevalx-go_pass@1: 0
-        humanevalx-java_pass@1: 74.39
+        humanevalx-java_pass@1: 53.05
        humanevalx-js_pass@1: 75
        ds1000_Pandas_accuracy: 14.09
        ds1000_Numpy_accuracy: 8.18
@ -548,7 +548,7 @@ qwen2.5-7b-instruct-turbomind:
        openai_mmmlu_lite_SW-KE_accuracy: 36.42
        openai_mmmlu_lite_YO-NG_accuracy: 32.14
        openai_mmmlu_lite_ZH-CN_accuracy: 69.61
-        college_naive_average: 48
+        college_naive_average: 44.33
        high_naive_average: 59
        middle_naive_average: 78
        primary_naive_average: 85.67
@ -658,7 +658,7 @@ internlm2_5-7b-chat-pytorch:
        college_naive_average: 21
        high_naive_average: 47
        middle_naive_average: 59.67
-        primary_naive_average: 76
+        primary_naive_average: 72.33
        arithmetic_naive_average: 62
        mathbench-a (average)_naive_average: 53.13
        college_knowledge_naive_average: 68.99
@ -688,7 +688,7 @@ qwen2.5-7b-instruct-pytorch:
        gsm8k_accuracy: 91.66
        GaokaoBench_weighted_average: 80.02
        math_accuracy: 73.74
-        cmo_fib_accuracy: 26.44
+        cmo_fib_accuracy: 22.60
        aime2024_accuracy: 13.33
        Mathbench_naive_average: 77.08
        wikibench-wiki-single_choice_cncircular_perf_4: 34
@ -793,8 +793,8 @@ internlm3-8b-instruct-turbomind:
        gsm8k_accuracy: 91.28
        GaokaoBench_weighted_average: 86.59
        math_accuracy: 76.96
-        cmo_fib_accuracy: 35.1
-        aime2024_accuracy: 16.67
+        cmo_fib_accuracy: 38.46
+        aime2024_accuracy: 13.33
        Mathbench_naive_average: 78.96
        wikibench-wiki-single_choice_cncircular_perf_4: 37.45
        cmmlu_naive_average: 83.33
@ -841,7 +841,7 @@ internlm3-8b-instruct-turbomind:
        humanevalx-python_pass@1: 43.9
        humanevalx-cpp_pass@1: 20.12
        humanevalx-go_pass@1: 0
-        humanevalx-java_pass@1: 74.39
+        humanevalx-java_pass@1: 40.85
        humanevalx-js_pass@1: 65.24
        ds1000_Pandas_accuracy: 16.49
        ds1000_Numpy_accuracy: 34.09
@ -907,7 +907,7 @@ internlm3-8b-instruct-pytorch:
        mmlu_pro_naive_average: 58.16
        openai_humaneval_humaneval_pass@1: 82.32
        sanitized_mbpp_score: 70.04
-        humanevalx_naive_average: 39.76
+        humanevalx_naive_average: 25.49
        ds1000_naive_average: 27.84
        lcb_code_generation_pass@1: 34.5
        lcb_code_execution_pass@1: 48.02
@ -946,7 +946,7 @@ internlm3-8b-instruct-pytorch:
        humanevalx-python_pass@1: 42.68
        humanevalx-cpp_pass@1: 19.51
        humanevalx-go_pass@1: 0
-        humanevalx-java_pass@1: 72.56
+        humanevalx-java_pass@1: 0.00
        humanevalx-js_pass@1: 64.02
        ds1000_Pandas_accuracy: 14.09
        ds1000_Numpy_accuracy: 35
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@ -12,13 +12,13 @@ chat:
        gsm8k_accuracy: 46.88
        race-high_accuracy: 81.25
    deepseek-r1-distill-llama-8b-turbomind:
-        gsm8k_accuracy: 31.25
+        gsm8k_accuracy: 34.38
        race-high_accuracy: 81.25
    deepseek-r1-distill-qwen-1_5b-turbomind:
-        gsm8k_accuracy: 37.5
+        gsm8k_accuracy: 28.12
        race-high_accuracy: 53.12
    deepseek-7b-chat-vllm:
-        gsm8k_accuracy: 43.75
+        gsm8k_accuracy: 56.25
        race-high_accuracy: 78.12
    gemma2-2b-it-hf:
        gsm8k_accuracy: 50
@ -33,13 +33,13 @@ chat:
        gsm8k_accuracy: 40.62
        race-high_accuracy: 68.75
    gemma-2-9b-it-turbomind:
-        gsm8k_accuracy: 71.88
+        gsm8k_accuracy: 68.75
        race-high_accuracy: 84.38
    gemma-2-27b-it-turbomind:
        gsm8k_accuracy: 78.12
        race-high_accuracy: 93.75
    gemma-7b-it-vllm:
-        gsm8k_accuracy: 31.25
+        gsm8k_accuracy: 28.12
        race-high_accuracy: 68.75
    internlm2_5-7b-chat-hf:
        gsm8k_accuracy: 84.38
@ -48,25 +48,25 @@ chat:
        gsm8k_accuracy: 65.62
        race-high_accuracy: 87.5
    internlm2_5-7b-chat-turbomind:
-        gsm8k_accuracy: 84.38
+        gsm8k_accuracy: 81.25
        race-high_accuracy: 90.62
    internlm2-chat-1.8b-turbomind:
-        gsm8k_accuracy: 28.12
+        gsm8k_accuracy: 25.00
        race-high_accuracy: 84.38
    internlm2-chat-1.8b-sft-turbomind:
-        gsm8k_accuracy: 31.25
+        gsm8k_accuracy: 34.38
        race-high_accuracy: 84.38
    internlm2-chat-7b-lmdeploy:
        gsm8k_accuracy: 59.38
-        race-high_accuracy: 84.38
+        race-high_accuracy: 87.50
    internlm2-chat-7b-sft-turbomind:
        gsm8k_accuracy: 56.25
-        race-high_accuracy: 90.62
+        race-high_accuracy: 87.50
    internlm3-8b-instruct-turbomind:
-        gsm8k_accuracy: 68.75
+        gsm8k_accuracy: 65.62
        race-high_accuracy: 87.5
    internlm2-chat-7b-vllm:
-        gsm8k_accuracy: 59.38
+        gsm8k_accuracy: 53.12
        race-high_accuracy: 87.50
    llama-3_1-8b-instruct-hf:
        gsm8k_accuracy: 84.38
@ -81,13 +81,13 @@ chat:
        gsm8k_accuracy: 18.75
        race-high_accuracy: 46.88
    llama-3_1-8b-instruct-turbomind:
-        gsm8k_accuracy: 81.25
+        gsm8k_accuracy: 84.38
        race-high_accuracy: 90.62
    llama-3_2-3b-instruct-turbomind:
-        gsm8k_accuracy: 68.75
+        gsm8k_accuracy: 65.62
        race-high_accuracy: 81.25
    llama-3-8b-instruct-turbomind:
-        gsm8k_accuracy: 68.75
+        gsm8k_accuracy: 65.62
        race-high_accuracy: 84.38
    mistral-7b-instruct-v0.2-hf:
        gsm8k_accuracy: 40.62
@ -100,12 +100,12 @@ chat:
        race-high_accuracy: 81.25
    mistral-nemo-instruct-2407-turbomind:
        gsm8k_accuracy: 71.88
-        race-high_accuracy: 78.12
+        race-high_accuracy: 75
    mistral-7b-instruct-v0.1-vllm:
        gsm8k_accuracy: 34.38
        race-high_accuracy: 65.62
    mistral-7b-instruct-v0.2-vllm:
-        gsm8k_accuracy: 21.88
+        gsm8k_accuracy: 28.12
        race-high_accuracy: 78.12
    qwen2.5-0.5b-instruct-hf:
        gsm8k_accuracy: 34.38
@ -114,7 +114,7 @@ chat:
        gsm8k_accuracy: 53.12
        race-high_accuracy: 90.62
    qwen2.5-0.5b-instruct-turbomind:
-        gsm8k_accuracy: 31.25
+        gsm8k_accuracy: 28.12
        race-high_accuracy: 43.75
    qwen2.5-3b-instruct-turbomind:
        gsm8k_accuracy: 56.25
@ -132,10 +132,10 @@ chat:
        gsm8k_accuracy: 56.25
        race-high_accuracy: 84.38
    qwen2-7b-instruct-turbomind:
-        gsm8k_accuracy: 81.25
+        gsm8k_accuracy: 75.00
        race-high_accuracy: 87.50
    qwen1.5-0.5b-chat-vllm:
-        gsm8k_accuracy: 3.12
+        gsm8k_accuracy: 6.25
        race-high_accuracy: 53.12
    yi-1.5-6b-chat-hf:
        gsm8k_accuracy: 65.62
@ -144,13 +144,13 @@ chat:
        gsm8k_accuracy: 75
        race-high_accuracy: 93.75
    yi-1.5-6b-chat-turbomind:
-        gsm8k_accuracy: 62.5
+        gsm8k_accuracy: 59.38
        race-high_accuracy: 84.38
    yi-1.5-9b-chat-turbomind:
-        gsm8k_accuracy: 71.88
+        gsm8k_accuracy: 78.12
        race-high_accuracy: 93.75
    deepseek-v2_lite-chat-turbomind:
-        gsm8k_accuracy: 37.5
+        gsm8k_accuracy: 43.75
        race-high_accuracy: 71.88
    gemma2-27b-it-hf:
        gsm8k_accuracy: 71.88
@ -165,7 +165,7 @@ chat:
        gsm8k_accuracy: 81.25
        race-high_accuracy: 87.50
    mistral-small-instruct-2409-turbomind:
-        gsm8k_accuracy: 81.25
+        gsm8k_accuracy: 78.12
        race-high_accuracy: 87.50
    phi-4:
        gsm8k_accuracy: 81.25
@ -174,16 +174,16 @@ chat:
        gsm8k_accuracy: 71.88
        race-high_accuracy: 96.88
    qwen2.5-14b-instruct-turbomind:
-        gsm8k_accuracy: 68.75
-        race-high_accuracy: 93.75
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 96.88
    yi-1.5-34b-chat-turbomind:
-        gsm8k_accuracy: 75.00
+        gsm8k_accuracy: 71.88
        race-high_accuracy: 93.75
    deepseek-67b-chat-turbomind:
-        gsm8k_accuracy: 75.00
-        race-high_accuracy: 78.12
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 75.00
    deepseek-r1-distill-qwen-32b-turbomind:
-        gsm8k_accuracy: 25
+        gsm8k_accuracy: 31.25
        race-high_accuracy: 90.62
    llama-3_3-70b-instruct-turbomind:
        gsm8k_accuracy: 93.75
@ -192,19 +192,19 @@ chat:
        gsm8k_accuracy: 87.50
        race-high_accuracy: 93.75
    nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
-        gsm8k_accuracy: 93.75
-        race-high_accuracy: 50.00
+        gsm8k_accuracy: 90.62
+        race-high_accuracy: 53.12
    qwen2.5-72b-instruct-turbomind:
-        gsm8k_accuracy: 81.25
+        gsm8k_accuracy: 78.12
        race-high_accuracy: 90.62
    deepseek-r1-distill-llama-70b-turbomind:
-        gsm8k_accuracy: 40.62
-        race-high_accuracy: 90.62
+        gsm8k_accuracy: 50.00
+        race-high_accuracy: 87.50
    deepseek-v2_5-1210-turbomind:
        gsm8k_accuracy: 90.62
        race-high_accuracy: 84.38
    mixtral-8x22b-instruct-v0.1-turbomind:
-        gsm8k_accuracy: 78.12
+        gsm8k_accuracy: 75.00
        race-high_accuracy: 78.12
    mixtral-8x22b-instruct-v0.1-vllm:
        gsm8k_accuracy: 78.12
@ -222,11 +222,11 @@ base:
        winogrande_accuracy: 71.88
    deepseek-7b-base-turbomind:
        gsm8k_accuracy: 18.75
-        GPQA_diamond_accuracy: 0
-        race-high_accuracy: 43.75
+        GPQA_diamond_accuracy: 3.12
+        race-high_accuracy: 50.00
        winogrande_accuracy: 84.38
    deepseek-moe-16b-base-vllm:
-        gsm8k_accuracy: 21.88
+        gsm8k_accuracy: 25.00
        GPQA_diamond_accuracy: 0
        race-high_accuracy: 25
        winogrande_accuracy: 68.75
@ -253,15 +253,15 @@ base:
    gemma-2-9b-turbomind:
        gsm8k_accuracy: 68.75
        GPQA_diamond_accuracy: 0
-        race-high_accuracy: 18.75
-        winogrande_accuracy: 46.88
+        race-high_accuracy: 84.38
+        winogrande_accuracy: 81.25
    gemma-2b-vllm:
        gsm8k_accuracy: 15.62
        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 28.12
        winogrande_accuracy: 68.75
    gemma-7b-vllm:
-        gsm8k_accuracy: 43.75
+        gsm8k_accuracy: 59.38
        GPQA_diamond_accuracy: 6.25
        race-high_accuracy: 81.25
        winogrande_accuracy: 81.25
@ -276,8 +276,8 @@ base:
        race-high_accuracy: 62.5
        winogrande_accuracy: 78.12
    internlm2-1.8b-turbomind:
-        gsm8k_accuracy: 6.25
-        GPQA_diamond_accuracy: 12.5
+        gsm8k_accuracy: 12.50
+        GPQA_diamond_accuracy: 9.38
        race-high_accuracy: 71.88
        winogrande_accuracy: 75
    internlm2_5-7b-turbomind:
@ -286,13 +286,13 @@ base:
        race-high_accuracy: 93.75
        winogrande_accuracy: 87.5
    internlm2-7b-turbomind:
-        gsm8k_accuracy: 59.38
-        GPQA_diamond_accuracy: 34.38
+        gsm8k_accuracy: 53.12
+        GPQA_diamond_accuracy: 25.00
        race-high_accuracy: 78.12
        winogrande_accuracy: 71.88
    internlm2-base-7b-turbomind:
-        gsm8k_accuracy: 28.12
-        GPQA_diamond_accuracy: 31.25
+        gsm8k_accuracy: 25.00
+        GPQA_diamond_accuracy: 34.38
        race-high_accuracy: 71.88
        winogrande_accuracy: 62.50
    llama-2-7b-hf:
@ -311,8 +311,8 @@ base:
        race-high_accuracy: 65.62
        winogrande_accuracy: 65.62
    llama-3.1-8b-turbomind:
-        gsm8k_accuracy: 59.38
-        GPQA_diamond_accuracy: 15.62
+        gsm8k_accuracy: 56.25
+        GPQA_diamond_accuracy: 9.38
        race-high_accuracy: 78.12
        winogrande_accuracy: 78.12
    llama-3-8b-turbomind:
@ -332,12 +332,12 @@ base:
        winogrande_accuracy: 71.88
    qwen2.5-1.5b-turbomind:
        gsm8k_accuracy: 59.38
-        GPQA_diamond_accuracy: 18.75
-        race-high_accuracy: 75
+        GPQA_diamond_accuracy: 21.88
+        race-high_accuracy: 78.12
        winogrande_accuracy: 71.88
    qwen2.5-7b-turbomind:
-        gsm8k_accuracy: 71.88
-        GPQA_diamond_accuracy: 18.75
+        gsm8k_accuracy: 78.12
+        GPQA_diamond_accuracy: 21.88
        race-high_accuracy: 87.5
        winogrande_accuracy: 75.00
    qwen1.5-moe-a2.7b-hf:
@ -361,18 +361,18 @@ base:
        race-high_accuracy: 87.5
        winogrande_accuracy: 68.75
    qwen2-1.5b-turbomind:
-        gsm8k_accuracy: 59.38
-        GPQA_diamond_accuracy: 6.25
+        gsm8k_accuracy: 56.25
+        GPQA_diamond_accuracy: 12.50
        race-high_accuracy: 81.25
        winogrande_accuracy: 75
    qwen2-7b-turbomind:
-        gsm8k_accuracy: 62.5
+        gsm8k_accuracy: 65.62
        GPQA_diamond_accuracy: 12.5
        race-high_accuracy: 87.5
        winogrande_accuracy: 75
    qwen1.5-0.5b-vllm:
        gsm8k_accuracy: 9.38
-        GPQA_diamond_accuracy: 0
+        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 56.25
        winogrande_accuracy: 59.38
    yi-1.5-6b-hf:
@ -386,7 +386,7 @@ base:
        race-high_accuracy: 87.5
        winogrande_accuracy: 59.38
    yi-1.5-9b-turbomind:
-        gsm8k_accuracy: 78.12
+        gsm8k_accuracy: 75.00
        GPQA_diamond_accuracy: 40.62
        race-high_accuracy: 87.5
        winogrande_accuracy: 65.62
@ -406,13 +406,13 @@ base:
        race-high_accuracy: 93.75
        winogrande_accuracy: 78.12
    qwen2.5-32b-turbomind:
-        gsm8k_accuracy: 84.38
-        GPQA_diamond_accuracy: 28.12
+        gsm8k_accuracy: 90.62
+        GPQA_diamond_accuracy: 31.25
        race-high_accuracy: 93.75
        winogrande_accuracy: 81.25
    deepseek-67b-base-turbomind:
-        gsm8k_accuracy: 59.38
-        GPQA_diamond_accuracy: 34.38
+        gsm8k_accuracy: 62.50
+        GPQA_diamond_accuracy: 31.25
        race-high_accuracy: 78.12
        winogrande_accuracy: 81.25
    llama-3-70b-turbomind:
@ -422,11 +422,11 @@ base:
        winogrande_accuracy: 84.38
    qwen2.5-72b-turbomind:
        gsm8k_accuracy: 84.38
-        GPQA_diamond_accuracy: 31.25
+        GPQA_diamond_accuracy: 40.62
        race-high_accuracy: 93.75
        winogrande_accuracy: 87.5
    deepseek-v2-turbomind:
        gsm8k_accuracy: 65.62
-        GPQA_diamond_accuracy: 9.38
+        GPQA_diamond_accuracy: 3.12
        race-high_accuracy: 93.75
        winogrande_accuracy: 81.25
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@ -17,7 +17,7 @@ on:
        required: false
        description: 'whether to build lmdeploy'
        type:  boolean
-        default: true
+        default: false
      repo_org_lmdeploy:
        required: false
        description: 'Tested repository organization name. Default is internlm/lmdeploy'
@ -146,7 +146,7 @@ jobs:
      - name: Prepare - create conda env and install torch - cu12
        uses: nick-fields/retry@v3
        with:
-          max_attempts: 1
+          max_attempts: 3
          timeout_minutes: 120
          command: |
            . ${{env.CONDA_PATH}}/bin/activate
@ -182,7 +182,7 @@ jobs:
          pip list

  daily_run_test_volc:
-    if: ${{!cancelled()}}
+    if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
    needs: prepare_env
    strategy:
      fail-fast: false
@ -222,7 +222,7 @@ jobs:


  daily_run_test_local:
-    if: ${{!cancelled()}}
+    if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
    needs: prepare_env
    strategy:
      fail-fast: false
@ -303,7 +303,7 @@ jobs:
          python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py

  fullbench_run_test:
-    if: ${{!cancelled()}}
+    if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
    needs: prepare_env
    strategy:
      fail-fast: false
--- a/.pre-commit-config-zh-cn.yaml
+++ b/.pre-commit-config-zh-cn.yaml
@ -115,9 +115,15 @@ repos:
        args:
          - --root_folder
          - opencompass/configs/datasets
+  - repo: https://github.com/gitleaks/gitleaks
+    rev: v8.23.1
+    hooks:
+    -   id: gitleaks
+        entry: "gitleaks dir"
+        args: ["--verbose", "--redact=50"]
  # - repo: https://github.com/open-mmlab/pre-commit-hooks
  #   rev: v0.2.0  # Use the ref you want to point at
  #   hooks:
  #     - id: check-algo-readme
      # - id: check-copyright
-      #   args: ["mmocr", "tests", "tools"]  # these directories will be checked
+      #   args: ["mmocr", "tests", "tools"]  # these directories will be checked
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -116,9 +116,15 @@ repos:
        args:
          - --root_folder
          - opencompass/configs/datasets
+  - repo: https://github.com/gitleaks/gitleaks
+    rev: v8.23.1
+    hooks:
+    -   id: gitleaks
+        entry: "gitleaks dir"
+        args: ["--verbose", "--redact=50"]
  # - repo: https://github.com/open-mmlab/pre-commit-hooks
  #   rev: v0.2.0  # Use the ref you want to point at
  #   hooks:
  #     - id: check-algo-readme
      # - id: check-copyright
-      #   args: ["mmocr", "tests", "tools"]  # these directories will be checked
+      #   args: ["mmocr", "tests", "tools"]  # these directories will be checked
--- a/README.md
+++ b/README.md
@ -60,7 +60,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
 - **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
 - **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
 - **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
+- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHVerifyEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
 - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
 - **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
@ -246,7 +246,7 @@ Currently, OpenCompass have provided standard recommended configurations for dat
 opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat

 # Recommended Evaluation Config based on LLM Judge
-opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
+opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
 ```

 If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@ -60,7 +60,7 @@
 - **\[2025.04.01\]** OpenCompass 现已支持 `CascadeEvaluator`，允许多个评估器按顺序工作，可以为更复杂的评估场景创建自定义评估流程，查看[文档](docs/zh_cn/advanced_guides/llm_judge.md)了解具体用法！🔥🔥🔥
 - **\[2025.03.11\]** 现已支持 `SuperGPQA`  覆盖285 个研究生学科的知识能力评测，欢迎尝试！🔥🔥🔥
 - **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程，请查看 [评估推理模型](docs/zh_cn/user_guides/deepseek_r1.md) 了解更多详情！🔥🔥🔥
- **\[2025.02.15\]** 我们新增了两个实用的评测工具：用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情！🔥🔥🔥
+- **\[2025.02.15\]** 我们新增了两个实用的评测工具：用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHVerifyEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情！🔥🔥🔥
 - **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型，该模型在推理、知识类任务上取得同量级最优性能，欢迎尝试。
 - **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py)，你可以通过简单地配置复现官方评测结果。
 - **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU)，欢迎尝试! 🔥🔥🔥
@ -237,7 +237,7 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
  opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat

  # 基于LLM Judge的推荐配置
-  opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
+  opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
  ```

  此外，如果你想在多块 GPU 上使用模型进行推理，您可以使用 `--max-num-worker` 参数。
--- a/dataset-index.yml
+++ b/dataset-index.yml
@ -122,12 +122,42 @@
    paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
    configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
    configpath_llmjudge: ''
+- MedCalc_Bench:
+    name: MedCalc_Bench
+    category: Knowledge / Medicine
+    paper: https://arxiv.org/abs/2406.12036
+    configpath: opencompass/configs/datasets/MedCalc_Bench/MedCalcBench_official_gen_a5155f.py
+    configpath_llmjudge: ''
+- MedQA:
+    name: MedQA
+    category: Knowledge / Medicine
+    paper: https://arxiv.org/abs/2009.13081
+    configpath: opencompass/configs/datasets/MedQA/MedQA_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py
 - MedXpertQA:
    name: MedXpertQA
    category: Knowledge / Medicine
    paper: https://arxiv.org/abs/2501.18362
    configpath: opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py
    configpath_llmjudge: opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py
+- ClinicBench:
+    name: ClinicBench
+    category: Knowledge / Medicine
+    paper: https://arxiv.org/abs/2405.00716
+    configpath: ''
+    configpath_llmjudge: opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen.py
+- ScienceQA:
+    name: ScienceQA
+    category: Knowledge / Medicine
+    paper: https://arxiv.org/abs/2209.09513
+    configpath: ''
+    configpath_llmjudge: opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen.py
+- PubMedQA:
+    name: PubMedQA
+    category: Knowledge / Medicine
+    paper: https://arxiv.org/abs/1909.06146
+    configpath: ''
+    configpath_llmjudge: opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen.py
 - musr:
    name: MuSR
    category: Reasoning
@ -273,7 +303,7 @@
    category: Examination
    paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
    configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py
-    configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen.py
 - anli:
    name: Adversarial NLI
    category: Reasoning
@ -343,6 +373,12 @@
    paper: https://arxiv.org/pdf/2004.05986
    configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py
    configpath_llmjudge: ''
+- CARDBiomedBench:
+    name: CARDBiomedBench
+    category: Knowledge / Medicine
+    paper: https://www.biorxiv.org/content/10.1101/2025.01.15.633272v1
+    configpath: opencompass/configs/datasets/CARDBiomedBench
+    configpath_llmjudge: 'opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py'
 - cb:
    name: SuperGLUE / CB
    category: Reasoning
@ -575,6 +611,12 @@
    paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
    configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
    configpath_llmjudge: ''
+- humaneval_pro:
+    name: HumanEval Pro
+    category: Code
+    paper: https://arxiv.org/abs/2412.21199
+    configpath: opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
+    configpath_llmjudge: ''
 - hungarian_math:
    name: Hungarian_Math
    category: Math
@ -659,6 +701,12 @@
    paper: ''
    configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
    configpath_llmjudge: ''
+- mbpp_pro:
+    name: MBPP Pro
+    category: Code
+    paper: https://arxiv.org/abs/2412.21199
+    configpath: opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
+    configpath_llmjudge: ''
 - mgsm:
    name: MGSM
    category: Language / Math
@ -671,6 +719,12 @@
    paper: https://arxiv.org/pdf/2009.03300
    configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py
    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
+- SciEval:
+    name: SciEval
+    category: Understanding
+    paper: https://arxiv.org/pdf/2308.13149
+    configpath: opencompass/configs/datasets/SciEval/SciEval_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/SciEval/SciEval_llm_judge_gen.py
 - mmlu_cf:
    name: MMLU-CF
    category: Understanding
@ -739,6 +793,12 @@
    paper: https://arxiv.org/pdf/1911.11641v1
    configpath: opencompass/configs/datasets/piqa/piqa_gen.py
    configpath_llmjudge: ''
+- ProteinLMBench:
+    name: ProteinLMBench
+    category: Knowledge / Biology (Protein)
+    paper: https://arxiv.org/abs/2406.05540
+    configpath: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen.py
 - py150:
    name: py150
    category: Code
@ -1023,3 +1083,33 @@
    paper: https://arxiv.org/pdf/2402.09391
    configpath: opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
    configpath_llmjudge: ''
+- SciKnowEval:
+    name: SciKnowEval
+    category: Science
+    paper: https://arxiv.org/abs/2406.09098
+    configpath: opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py
+    configpath_llmjudge: opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py
+- internsandbox:
+    name: InternSandbox
+    category: Reasoning/Code/Agent
+    paper: ''
+    configpath: opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py
+    configpath_llmjudge: ''
+- nejmaibench:
+    name: nejmaibench
+    category: Science / Medicine
+    paper: https://arxiv.org/pdf/2308.04709
+    configpath: opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py
+- medbullets:
+    name: Medbullets
+    category: Science / Medicine
+    paper: https://arxiv.org/pdf/2402.18060
+    configpath: opencompass/configs/datasets/Medbullets/medbullets_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen.py
+- medmcqa:
+    name: medmcqa
+    category: Science / Medicine
+    paper: https://arxiv.org/pdf/2203.14371
+    configpath: opencompass/configs/datasets/medmcqa/medmcqa_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py
--- a/docs/en/advanced_guides/llm_judge.md
+++ b/docs/en/advanced_guides/llm_judge.md
@ -278,7 +278,7 @@ Here's an example of how to configure the CascadeEvaluator:

 ```python
 # Define a rule-based evaluator
-rule_evaluator = dict(type=MATHEvaluator)
+rule_evaluator = dict(type=MATHVerifyEvaluator)

 # Define an LLM judge evaluator
 llm_judge_evaluator = dict(
--- a/docs/en/advanced_guides/math_verify.md
+++ b/docs/en/advanced_guides/math_verify.md
@ -2,7 +2,7 @@

 ## Introduction

-Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHEvaluator components.
+Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHVerifyEvaluator components.

 ## Dataset Format

@ -61,7 +61,7 @@ math_infer_cfg = dict(

 ```python
 math_eval_cfg = dict(
-    evaluator=dict(type=MATHEvaluator),
+    evaluator=dict(type=MATHVerifyEvaluator),
 )
 ```

@ -86,11 +86,11 @@ math_datasets = [
 ]
 ```

-## MATHEvaluator
+## MATHVerifyEvaluator

-The MATHEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions.
+The MATHVerifyEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions.

-The MATHEvaluator implements:
+The MATHVerifyEvaluator implements:

 1. Extracts answers from both predictions and references using LaTeX extraction
 2. Handles various LaTeX formats and environments
@ -133,7 +133,7 @@ Here's a complete example of how to set up math evaluation:
 from mmengine.config import read_base
 from opencompass.models import TurboMindModelwithChatTemplate
 from opencompass.datasets import CustomDataset
-from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
+from opencompass.evaluator import MATHVerifyEvaluator
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
@ -160,7 +160,7 @@ math_infer_cfg = dict(

 # Evaluation configuration
 math_eval_cfg = dict(
-    evaluator=dict(type=MATHEvaluator),
+    evaluator=dict(type=MATHVerifyEvaluator),
 )

 # Dataset configuration
--- a/docs/zh_cn/advanced_guides/llm_judge.md
+++ b/docs/zh_cn/advanced_guides/llm_judge.md
@ -277,7 +277,7 @@ OpenCompass还提供了级联评估器`CascadeEvaluator`，它结合了规则式

 ```python
 # 定义规则式评估器
-rule_evaluator = dict(type=MATHEvaluator)
+rule_evaluator = dict(type=MATHVerifyEvaluator)

 # 定义LLM评判器
 llm_judge_evaluator = dict(
--- a/docs/zh_cn/advanced_guides/math_verify.md
+++ b/docs/zh_cn/advanced_guides/math_verify.md
@ -2,7 +2,7 @@

 ## 简介

-数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力，我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHEvaluator 组件提供了一种便捷的数学推理评测方式。
+数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力，我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHVerifyEvaluator 组件提供了一种便捷的数学推理评测方式。

 ## 数据集格式

@ -61,7 +61,7 @@ math_infer_cfg = dict(

 ```python
 math_eval_cfg = dict(
-    evaluator=dict(type=MATHEvaluator),
+    evaluator=dict(type=MATHVerifyEvaluator),
 )
 ```

@ -86,11 +86,11 @@ math_datasets = [
 ]
 ```

-## MATHEvaluator
+## MATHVerifyEvaluator

-MATHEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发，该库提供了数学表达式解析和验证功能，支持 LaTeX 和一般表达式的提取与等价性验证。
+MATHVerifyEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发，该库提供了数学表达式解析和验证功能，支持 LaTeX 和一般表达式的提取与等价性验证。

-MATHEvaluator 具有以下功能：
+MATHVerifyEvaluator 具有以下功能：

 1. 使用 LaTeX 提取器从预测和参考答案中提取答案
 2. 处理各种 LaTeX 格式和环境
@ -133,7 +133,7 @@ MATHEvaluator 具有以下功能：
 from mmengine.config import read_base
 from opencompass.models import TurboMindModelwithChatTemplate
 from opencompass.datasets import CustomDataset
-from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
+from opencompass.evaluator import MATHVerifyEvaluator
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
@ -160,7 +160,7 @@ math_infer_cfg = dict(

 # 评测配置
 math_eval_cfg = dict(
-    evaluator=dict(type=MATHEvaluator),
+    evaluator=dict(type=MATHVerifyEvaluator),
 )

 # 数据集配置
--- a/examples/eval_cascade_evaluator.py
+++ b/examples/eval_cascade_evaluator.py
@ -7,9 +7,12 @@ from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
+from opencompass.evaluator import (
+    GenericLLMEvaluator,
+    CascadeEvaluator,
+    MATHVerifyEvaluator,
+)
 from opencompass.datasets import generic_llmjudge_postprocess
-from opencompass.openicl.icl_evaluator import MATHEvaluator
 from opencompass.datasets import (
    MATHDataset,
    math_postprocess_v2,
@ -94,7 +97,7 @@ llm_judge_evaluator =   dict(
        judge_cfg=dict(),
    )

-rule_evaluator =dict(type=MATHEvaluator)
+rule_evaluator =dict(type=MATHVerifyEvaluator)
 cascade_evaluator = dict(type=CascadeEvaluator,
                   llm_evaluator=llm_judge_evaluator,
                   rule_evaluator=rule_evaluator,
--- a/examples/eval_codebench_full.py
+++ b/examples/eval_codebench_full.py
@ -0,0 +1,155 @@
+# This config is used to test all the code benchmarks
+from mmengine.config import read_base
+import os.path as osp
+from opencompass.runners import LocalRunner, VOLCRunner
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+with read_base():
+    # Datasets Part
+    # bigcodebench
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen import (
+        bigcodebench_full_instruct_datasets
+    )
+    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen import (
+        bigcodebench_hard_instruct_datasets
+    )
+    # livecodebench code generation lite v5
+    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen_a4f90b import (
+        LCB_datasets
+    )
+    # humaneval series
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
+        humaneval_datasets
+    )
+    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
+        humanevalpro_datasets
+    )
+    from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import (
+        humanevalx_datasets
+    )
+    from opencompass.configs.datasets.humaneval_plus.humaneval_plus_gen import (
+        humaneval_plus_datasets
+    )
+    # mbpp series
+    from opencompass.configs.datasets.mbpp.mbpp_gen import (
+        mbpp_datasets
+    )
+    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
+        mbpppro_datasets
+    )
+    # multipl-e
+    from opencompass.configs.datasets.multipl_e.multiple_gen import (
+        multiple_datasets
+    )
+    # ds1000
+    from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
+        ds1000_datasets
+    )
+
+    # Models Part
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
+        models as lmdeploy_qwen2_5_7b_instruct_model,
+    )
+
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.ds1000 import (
+        ds1000_summary_groups,
+    )
+    from opencompass.configs.summarizers.groups.multipl_e import (
+        multiple_summary_groups,
+    )
+    from opencompass.configs.summarizers.groups.humanevalx import (
+        humanevalx_summary_groups,
+    )
+
+# models config
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+for model in models:
+    model['max_seq_len'] = 16384
+    model['max_out_len'] = 8192
+
+# datasets config
+datasets = sum(
+    (v for k, v in locals().items() if k.endswith('_datasets')),
+    [],
+)
+
+for item in humanevalx_datasets:
+    item['eval_cfg']['evaluator'][
+        'ip_address'
+    ] = 'codeeval.opencompass.org.cn/humanevalx'
+    item['eval_cfg']['evaluator']['port'] = ''
+for item in ds1000_datasets:
+    item['eval_cfg']['evaluator'][
+        'ip_address'
+    ] = 'codeeval.opencompass.org.cn/ds1000'
+    item['eval_cfg']['evaluator']['port'] = ''
+
+
+for dataset in datasets:
+    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
+
+
+# summary
+summary_groups = sum(
+    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
+)
+summary_groups.append(
+    {'name': 'humanevalx', 
+    'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js']}
+)
+summarizer = dict(
+    dataset_abbrs = [
+        ['bigcodebench_hard_instruct', 'pass@1'],
+        ['bigcodebench_full_instruct', 'pass@1'],
+        ['lcb_code_generation', 'pass@1'],
+        ['openai_humaneval', 'humaneval_pass@1'],
+        ['mbpp', 'score'],
+        ['humaneval_pro', 'pass@1'],
+        ['mbpp_pro', 'pass@1'],
+        ['humaneval_plus', 'humaneval_plus_pass@1'],
+        ['multiple', 'naive_average'],
+        ['humanevalx', 'naive_average'],
+        ['ds1000', 'naive_average'],
+        '',
+        'humanevalx-python',
+        'humanevalx-cpp',
+        'humanevalx-java',
+        'humanevalx-js',
+        '',
+        'ds1000_Pandas',
+        'ds1000_Numpy',
+        'ds1000_Tensorflow',
+        'ds1000_Scipy',
+        'ds1000_Sklearn',
+        'ds1000_Pytorch',
+        'ds1000_Matplotlib',
+        '',
+        'humaneval-multiple-cpp', 
+        'humaneval-multiple-cs', 
+        'humaneval-multiple-go', 
+        'humaneval-multiple-java', 
+        'humaneval-multiple-rb', 
+        'humaneval-multiple-js', 
+        'humaneval-multiple-php', 
+        'humaneval-multiple-r', 
+        'humaneval-multiple-rs', 
+        'humaneval-multiple-sh',
+        '',
+        'mbpp-multiple-cpp', 
+        'mbpp-multiple-cs', 
+        'mbpp-multiple-go', 
+        'mbpp-multiple-java', 
+        'mbpp-multiple-rb', 
+        'mbpp-multiple-js', 
+        'mbpp-multiple-php', 
+        'mbpp-multiple-r', 
+        'mbpp-multiple-rs', 
+        'mbpp-multiple-sh'
+    ],
+    summary_groups=summary_groups,
+)
+
+work_dir = 'outputs/code'
--- a/examples/eval_judge_dataset_all.py
+++ b/examples/eval_judge_dataset_all.py
@ -0,0 +1,61 @@
+from mmengine.config import read_base
+with read_base():
+    from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset as get_judgerbenchv2_datasets
+    from opencompass.configs.datasets.judge.rmb import get_rmb_dataset as get_rmb_datasets
+    from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
+    from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets
+
+    from opencompass.configs.summarizers.judgedataset_all import summarizer
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
+from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
+from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
+from opencompass.runners import SlurmSequentialRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+from opencompass.models import TurboMindModelwithChatTemplate
+
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]
+)
+datasets = sum(
+    (v for k, v in locals().items() if k.endswith('_datasets')),
+    [],
+)
+
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='qwen-7b-hf',
+        path='Qwen/Qwen-7B',
+        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
+        max_seq_len=16384,
+        max_out_len=2048,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    ),
+]
+
+
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=72,
+        task=dict(type=OpenICLInferTask),
+    ),
+)
+
+
+
+work_dir = './outputs/judge_dataset_all/'
--- a/examples/eval_qwen3.py
+++ b/examples/eval_qwen3.py
@ -0,0 +1,142 @@
+
+import os.path as osp
+from opencompass.models import OpenAISDK
+from mmengine.config import read_base
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+from opencompass.runners import LocalRunner
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+with read_base():
+    from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import aime2024_datasets
+    from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
+    from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import math_datasets
+
+#######################################################################
+#                          PART 0  Meta Info                          #
+#######################################################################
+
+
+api_meta_template = dict(round=[
+    dict(role='HUMAN', api_role='HUMAN'),
+    dict(role='BOT', api_role='BOT', generate=True),
+], 
+)
+
+
+judge_cfg = dict(
+        abbr='qwen2-5-32B-Instruct',
+        type=OpenAISDK,
+        path='Qwen/Qwen2.5-32B-Instruct',
+        key='sk-1234',
+        openai_api_base=[
+            'http://x.x.x.x:4000/v1',
+        ],
+        meta_template=api_meta_template,
+        query_per_second=8,
+        batch_size=256,
+        temperature=0.001,
+        # max_completion_tokens=32768,
+        tokenizer_path='gpt-4o-2024-05-13',
+        # verbose=True,
+        max_out_len=16384,
+        max_seq_len=32768,
+        # max_seq_len=49152,
+        mode='mid',
+        retry=10
+)
+
+#######################################################################
+#                          PART 1  Datasets List                      #
+#######################################################################
+
+repeated_info = [
+    (math_datasets, 4),
+    (aime2024_datasets, 32),
+    (aime2025_datasets, 32),
+]
+
+for datasets_, num in repeated_info:
+    for dataset_ in datasets_:
+        dataset_['n'] = num
+
+datasets = sum(
+    (v for k, v in locals().items() if k.endswith('_datasets')),
+    [],
+)
+
+for item in datasets:
+    item['infer_cfg']['inferencer']['max_out_len'] = 32768
+    try:
+        if 'judge_cfg' in item['eval_cfg']['evaluator']:
+           item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
+        elif'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
+            item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
+    except:
+        pass
+#######################################################################
+#                       PART 2  Dataset Summarizer                    #
+#######################################################################
+
+summarizer = dict(
+    dataset_abbrs=[
+        'MATH',
+        ['math_prm800k_500', 'accuracy (4 runs average)'],
+        ['aime2024', 'accuracy (32 runs average)'],
+        ['aime2025', 'accuracy (32 runs average)'],
+        ['livemathbench_hard', 'naive_average'],
+        ['OlympiadBenchMath', 'accuracy'],
+        ['olymmath', 'naive_average'],
+    ],
+    summary_groups = sum(
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
+    ),
+)
+
+#######################################################################
+#                        PART 3  Models  List                         #
+#######################################################################
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+models += [
+
+    dict(
+        abbr='Qwen_Qwen3-235B-A22B',
+        type=OpenAISDK,
+        path='Qwen/Qwen3-235B-A22B',
+        key='sk-admin',
+        openai_api_base=[
+            'http://106.15.231.215:40007/v1/',
+        ],
+        meta_template=dict(
+            # begin=dict(role='SYSTEM', api_role='SYSTEM', prompt=''),
+            round=[
+                dict(role='HUMAN', api_role='HUMAN'),
+                # XXX: all system roles are mapped to human on purpose
+                dict(role='BOT', api_role='BOT', generate=True),
+            ]
+        ),
+        query_per_second=16,
+        batch_size=128,
+        # batch_size=1,
+        temperature=0.6,
+        # max_completion_tokens=32768,
+        tokenizer_path='gpt-4',
+        # verbose=True,
+        max_out_len=32768,
+        max_seq_len=32768,
+        pred_postprocessor=dict(type=extract_non_reasoning_content)
+    ),
+]
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
+    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
+)
+
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=8),
+    runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
+)
+
+base_exp_dir = 'outputs/qwen3_reasoning'
+work_dir = osp.join(base_exp_dir, 'chat_objective')
--- a/opencompass/cli/main.py
+++ b/opencompass/cli/main.py
@ -12,8 +12,8 @@ from mmengine.config import Config, DictAction
 from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
 from opencompass.runners import SlurmRunner
 from opencompass.summarizers import DefaultSummarizer
-from opencompass.utils import (LarkReporter, get_logger, read_from_station,
-                               save_to_station)
+from opencompass.utils import (LarkReporter, get_logger, pretty_print_config,
+                               read_from_station, save_to_station)
 from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg,
                                   get_config_from_arg)

@ -94,6 +94,11 @@ def parse_args():
        help='Use the custom config directory instead of config/ to '
        'search the configs for datasets, models and summarizers',
        type=str)
+    parser.add_argument(
+        '--config-verbose',
+        default=False,
+        action='store_true',
+        help='Whether to print the config in verbose mode.')
    parser.add_argument('-l',
                        '--lark',
                        help='Report the running status to lark bot',
@ -131,7 +136,7 @@ def parse_args():
        'correctness of each sample, bpb, etc.',
        action='store_true',
    )
-
+    # for the results persistence
    parser.add_argument('-sp',
        '--station-path',
        help='Path to your results station.',
@ -150,7 +155,12 @@ def parse_args():
             'data station.',
        action='store_true',
    )
-
+    # for evaluation with multiple runs
+    parser.add_argument('--dataset-num-runs',
+        help='How many runs for one dataset',
+        type=int,
+        default=1,
+    )

    # set srun args
    slurm_parser = parser.add_argument_group('slurm_args')
@ -299,6 +309,11 @@ def main():
        content = f'{getpass.getuser()}\'s task has been launched!'
        LarkReporter(cfg['lark_bot_url']).post(content)

+
+    # print the config if --config-verbose is specified
+    if args.config_verbose:
+        pretty_print_config(cfg)
+
    # infer
    if args.mode in ['all', 'infer']:
        # When user have specified --slurm or --dlc, or have not set
--- a/opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py
+++ b/opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py
@ -0,0 +1,101 @@
+from opencompass.datasets import CARDBiomedBenchDataset
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.evaluator import GenericLLMEvaluator
+ZERO_SHOT_PROMPT = 'You are an expert in {expert}.\n{question}\n'
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: Q: You are an expert in {expert}.\n{question}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+# Reader configuration
+reader_cfg = dict(
+    input_columns=[
+        'question',
+        'answer',
+        'Bio_Category',
+        'SQL_Category',
+        'uuid',
+        'template uuid',
+        'expert',
+    ],
+    output_column='answer',
+)
+# Inference configuration
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    
+                    role='HUMAN',
+                    prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration
+eval_cfg = dict(
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(
+            type=CARDBiomedBenchDataset,
+            path='NIH-CARD/CARDBiomedBench',
+            prompt_mode='zero-shot',
+            reader_cfg=reader_cfg,
+        ),
+        judge_cfg=dict(),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+cardbiomedbench_dataset = dict(
+    type=CARDBiomedBenchDataset,
+    abbr='cardbiomedbench',
+    path='NIH-CARD/CARDBiomedBench',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+)
+cardbiomedbench_datasets = [cardbiomedbench_dataset]
--- a/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen.py
+++ b/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .ClinicBench_llmjudge_gen_d09668 import ClinicBench_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen_d09668.py
+++ b/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen_d09668.py
@ -0,0 +1,100 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets.ClinicBench import ClinicBenchDataset
+
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
+
+Question:\n
+{question}
+
+Options:\n
+{choices}
+
+""".strip()
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
+    <Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+ClinicBench_datasets = []  # filled by the append() call at the end of this file
+
+ClinicBench_reader_cfg = dict(
+    input_columns=['question', 'choices'],  # placeholders used by QUERY_TEMPLATE
+    output_column='label',  # gold answer; referenced as {label} in GRADER_TEMPLATE
+)
+
+ClinicBench_infer_cfg = dict(  # inference: one HUMAN turn rendered from QUERY_TEMPLATE
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+ClinicBench_eval_cfg = dict(  # evaluation: GenericLLMEvaluator prompted with GRADER_TEMPLATE
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',  # presumably used when SYSTEM is unsupported; confirm
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(  # same type/path/reader_cfg as the entry appended to ClinicBench_datasets
+            type=ClinicBenchDataset,
+            path='xuxuxuxuxu/Pharmacology-QA',
+            reader_cfg=ClinicBench_reader_cfg,
+        ),
+        judge_cfg=dict(),  # empty here; presumably the judge model is supplied elsewhere (TODO confirm)
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+
+ClinicBench_datasets.append(  # the single ClinicBench dataset entry
+    dict(
+        abbr='ClinicBench',
+        type=ClinicBenchDataset,
+        path='xuxuxuxuxu/Pharmacology-QA',  # keep in sync with dataset_cfg in ClinicBench_eval_cfg
+        reader_cfg=ClinicBench_reader_cfg,
+        infer_cfg=ClinicBench_infer_cfg,
+        eval_cfg=ClinicBench_eval_cfg,
+    )
+)
--- a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py
+++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py
@ -1,31 +1,27 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
 from opencompass.evaluator import GenericLLMEvaluator
 from opencompass.datasets import generic_llmjudge_postprocess
-from opencompass.utils import xml_tag_postprocessor
+from opencompass.datasets import HLEDataset

-aime2024_reader_cfg = dict(
-    input_columns=['question'], 
-    output_column='answer'
-)
+# ----------------------------- Detailed Config -----------------------------

+math_reader_cfg = dict(input_columns=['problem'], output_column='answer')

-aime2024_infer_cfg = dict(
+math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
-                dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
-            ],
-        )
+                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), 
+            ]
+        ),
    ),
    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=2048)
+    inferencer=dict(type=GenInferencer),
 )

-
 GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
    
@ -35,23 +31,20 @@ GRADER_TEMPLATE = """
    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
-
    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
    A: CORRECT 
    B: INCORRECT
    Just return the letters "A" or "B", with no text around it.
-
    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
-
-
-    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
+    <Original Question Begin>: \n{problem}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
    
    Judging the correctness of candidates' answers:
 """.strip()

-aime2024_eval_cfg = dict(
+# Evaluation configuration
+math_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
@ -71,25 +64,25 @@ aime2024_eval_cfg = dict(
            ]),
        ),
        dataset_cfg=dict(
-            type=Aime2024Dataset,
-            path='opencompass/aime2024',
-            reader_cfg=aime2024_reader_cfg,
+            type=HLEDataset,
+            path='cais/hle',
+            reader_cfg=math_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
-        pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
    ),
    pred_role='BOT',
 )

-aime2024_datasets = [
+
+hle_datasets = [
    dict(
-        abbr='aime2024',
-        type=Aime2024Dataset,
-        path='opencompass/aime2024',
-        reader_cfg=aime2024_reader_cfg,
-        infer_cfg=aime2024_infer_cfg,
-        eval_cfg=aime2024_eval_cfg,
-        mode='singlescore',
+        type=HLEDataset,
+        abbr='hle_llmjudge',
+        path='cais/hle',
+        category='Biology/Medicine',
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg,
    )
 ]
--- a/opencompass/configs/datasets/MedCalc_Bench/MedCalcBench_official_gen_a5155f.py
+++ b/opencompass/configs/datasets/MedCalc_Bench/MedCalcBench_official_gen_a5155f.py
@ -0,0 +1,57 @@
+from opencompass.datasets import MedCalc_BenchDataset, MedCalcOfficial_Evaluator
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+
+ZERO_SHOT_PROMPT = 'You are a helpful assistant for calculating a score for a given patient note. Please think step-by-step to solve the question and then generate the required score. Your output should only contain a JSON dict formatted as {"step_by_step_thinking": str(your_step_by_step_thinking_procress_to_solve_the_question), "answer": str(short_and_direct_answer_of_the_question)}. \n Here is the patient note:\n{patient_note}\n\nHere is the task:\n{question}\n\nPlease directly output the JSON dict formatted as {"step_by_step_thinking": str(your_step_by_step_thinking_procress_to_solve_the_question), "answer": str(short_and_direct_answer_of_the_question)}:'
+# Reader configuration: columns exposed to the prompt, plus the gold column
+reader_cfg = dict(
+    input_columns=[
+        'row_number',
+        'calculator_id',
+        'calculator_name',
+        'category',
+        'note_id',
+        'output_type',
+        'note_type',
+        'patient_note',
+        'question',
+        'relevant_entities',
+        'ground_truth_answer',
+        'lower_limit',
+        'upper_limit',
+        'ground_truth_explanation',
+    ],
+    output_column='ground_truth_answer',
+)
+
+
+# Inference configuration: the model receives a single HUMAN turn rendered
+# from ZERO_SHOT_PROMPT.
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[dict(role='HUMAN', prompt=ZERO_SHOT_PROMPT)],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration: scoring is delegated to MedCalcOfficial_Evaluator
+eval_cfg = dict(
+    evaluator=dict(type=MedCalcOfficial_Evaluator),
+    pred_role='BOT',
+)
+medcal_bench_dataset = dict(  # NOTE(review): spelled 'medcal', unlike the MedCalc* classes; importers may rely on it
+    type=MedCalc_BenchDataset,
+    abbr='medcal_bench_official_zero_shot_eval',
+    path='ncbi/MedCalc-Bench-v1.0',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+)
+
+medcal_bench_datasets = [medcal_bench_dataset]  # list wrapper around the entry above
--- a/opencompass/configs/datasets/MedQA/MedQA_gen_3bf756.py
+++ b/opencompass/configs/datasets/MedQA/MedQA_gen_3bf756.py
@ -0,0 +1,63 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+from opencompass.datasets.MedQA import MedQADataset
+
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
+
+Question:\n
+{question}
+
+Options:\n
+{choices}
+
+""".strip()
+
+
+MedQA_datasets = []  # filled by the per-subset loop at the end of this file
+
+MedQA_reader_cfg = dict(
+    input_columns=['question', 'choices'],  # placeholders used by QUERY_TEMPLATE
+    output_column='label',
+)
+
+MedQA_infer_cfg = dict(  # inference: one HUMAN turn rendered from QUERY_TEMPLATE
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+MedQA_subsets = {  # subset name -> dataset path
+    'US': 'xuxuxuxuxu/MedQA_US_test',
+    'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
+    'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
+}
+
+for split in MedQA_subsets:
+    # NOTE(review): only 'ABCD' is extracted although QUERY_TEMPLATE allows A-P; confirm four options.
+    MedQA_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
+    )
+
+    MedQA_datasets.append(
+        dict(
+            abbr=f'MedQA_{split}',
+            type=MedQADataset,
+            path=MedQA_subsets[split],
+            reader_cfg=MedQA_reader_cfg,
+            infer_cfg=MedQA_infer_cfg,
+            eval_cfg=MedQA_eval_cfg,
+        )
+    )
--- a/opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen_3bf756.py
+++ b/opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen_3bf756.py
@ -0,0 +1,108 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets.MedQA import MedQADataset
+
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
+
+Question:\n
+{question}
+
+Options:\n
+{choices}
+
+""".strip()
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
+    <Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+MedQA_datasets = []  # filled by the per-subset loop at the end of this file
+
+MedQA_reader_cfg = dict(
+    input_columns=['question', 'choices'],  # placeholders used by QUERY_TEMPLATE and GRADER_TEMPLATE
+    output_column='label',  # gold answer; referenced as {label} in GRADER_TEMPLATE
+)
+
+MedQA_infer_cfg = dict(  # inference: one HUMAN turn rendered from QUERY_TEMPLATE
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+MedQA_subsets = {  # subset name -> dataset path
+    'US': 'xuxuxuxuxu/MedQA_US_test',
+    'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test',
+    'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test',
+}
+
+for split in MedQA_subsets:
+    # Built inside the loop because dataset_cfg carries the per-subset path.
+    MedQA_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(
+                type=MedQADataset,
+                path=MedQA_subsets[split],
+                reader_cfg=MedQA_reader_cfg,
+            ),
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+    )
+
+    MedQA_datasets.append(
+        dict(
+            abbr=f'MedQA_{split}',
+            type=MedQADataset,
+            path=MedQA_subsets[split],
+            reader_cfg=MedQA_reader_cfg,
+            infer_cfg=MedQA_infer_cfg,
+            eval_cfg=MedQA_eval_cfg,
+        )
+    )
--- a/opencompass/configs/datasets/Medbullets/medbullets_gen.py
+++ b/opencompass/configs/datasets/Medbullets/medbullets_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .medbullets_gen_60c8f5 import medbullets_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/Medbullets/medbullets_gen_60c8f5.py
+++ b/opencompass/configs/datasets/Medbullets/medbullets_gen_60c8f5.py
@ -0,0 +1,59 @@
+from opencompass.datasets import MedbulletsDataset, MedbulletsEvaluator
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+
+import os
+
+SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n'  # sent as the SYSTEM turn in infer_cfg
+ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'
+
+# Reader configuration
+reader_cfg = dict(
+    input_columns=[
+        'question',
+        'options',
+        'question_type',
+        'prompt_mode',
+        # NOTE(review): the prompt only has {question}; presumably the loader folds 'options' into it - confirm
+    ],
+    output_column='label',
+)
+
+# Inference configuration: SYSTEM_PROMPT as the opening turn, then one HUMAN turn from ZERO_SHOT_PROMPT
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration: scoring is delegated to MedbulletsEvaluator
+eval_cfg = dict(
+    evaluator=dict(type=MedbulletsEvaluator),
+    pred_role='BOT',
+)
+# Single Medbullets dataset entry (zero-shot prompt mode).
+medbullets_dataset = dict(
+    type=MedbulletsDataset,
+    abbr='medbullets',
+    path='opencompass/medbullets',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+)
+
+medbullets_datasets = [medbullets_dataset]
--- a/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen.py
+++ b/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .medbullets_llmjudge_gen_60c8f5 import medbullets_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen_60c8f5.py
+++ b/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen_60c8f5.py
@ -0,0 +1,106 @@
+from opencompass.datasets import MedbulletsDataset, medbullets_llmjudge_postprocess
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.evaluator import GenericLLMEvaluator
+import os
+
+SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n'  # sent as the SYSTEM turn in infer_cfg
+ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: Q: {question}\nPlease select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+# Reader configuration
+reader_cfg = dict(
+    input_columns=[
+        'question',
+        'options',
+        'question_type',
+        'prompt_mode',
+        # NOTE(review): the prompts only use {question}; presumably the loader folds 'options' into it - confirm
+    ],
+    output_column='label',
+)
+
+# Inference configuration: SYSTEM_PROMPT as the opening turn, then one HUMAN turn from ZERO_SHOT_PROMPT
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration: answers are graded by GenericLLMEvaluator using GRADER_TEMPLATE
+eval_cfg = dict(
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',  # presumably used when SYSTEM is unsupported; confirm
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(  # same type/path/prompt_mode/reader_cfg as medbullets_dataset
+            type=MedbulletsDataset,
+            path='opencompass/medbullets',
+            prompt_mode='zero-shot',
+            reader_cfg=reader_cfg,
+        ),
+        judge_cfg=dict(),  # empty here; presumably the judge model is supplied elsewhere (TODO confirm)
+        dict_postprocessor=dict(type=medbullets_llmjudge_postprocess),  # dataset-specific postprocessor
+    ),
+)
+
+
+# Single Medbullets dataset entry (zero-shot prompt mode).
+medbullets_dataset = dict(
+    type=MedbulletsDataset,
+    abbr='medbullets',
+    path='opencompass/medbullets',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+)
+
+medbullets_datasets = [medbullets_dataset]
--- a/opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py
+++ b/opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py
@ -0,0 +1,109 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets import OlymMATHDataset
+from opencompass.evaluator import (
+    CascadeEvaluator,
+    GenericLLMEvaluator,
+    MATHVerifyEvaluator
+)
+
+
+# ----------------------------- Detailed Config -----------------------------
+
+math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test')  # NOTE(review): train_split set to 'test'; confirm intent
+
+math_infer_cfg = dict(  # inference: one HUMAN turn = the problem plus a \boxed{} answer instruction
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy']  # one dataset entry is generated per subset
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{problem}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    
+    Judging the correctness of candidates' answers:
+""".strip()
+
+# Evaluation configuration: one dataset entry per subset, each evaluated by a
+# CascadeEvaluator with a MATHVerifyEvaluator rule stage and a GenericLLMEvaluator LLM stage.
+olymmath_datasets = []
+
+for sub_set in sub_sets:
+    math_eval_cfg = dict(
+        evaluator=dict(
+            type=CascadeEvaluator,
+            rule_evaluator=dict(
+                type=MATHVerifyEvaluator,
+            ),
+            llm_evaluator=dict(
+                type=GenericLLMEvaluator,
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(
+                        begin=[
+                            dict(
+                                role='SYSTEM',
+                                fallback_role='HUMAN',
+                                prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                            ),
+                        ],
+                        round=[
+                            dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                        ],
+                    ),
+                ),
+                dataset_cfg=dict(
+                    type=OlymMATHDataset,
+                    path='RUC-AIBOX/OlymMATH',
+                    reader_cfg=math_reader_cfg,
+                    subset=sub_set,
+                ),
+                judge_cfg=dict(),
+                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+            ),
+            parallel=False,
+        ),
+    )
+    # Register the entry for this subset; it shares the reader and inference configs.
+    olymmath_datasets.append(
+        dict(
+            type=OlymMATHDataset,
+            abbr=f'olymmath_{sub_set}',
+            path='RUC-AIBOX/OlymMATH',
+            reader_cfg=math_reader_cfg,
+            infer_cfg=math_infer_cfg,
+            eval_cfg=math_eval_cfg,
+            subset=sub_set,
+            n=1,
+        )
+    )
--- a/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py
+++ b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py
@ -0,0 +1,114 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.evaluator import (
+    GenericLLMEvaluator,
+    CascadeEvaluator,
+    MATHVerifyEvaluator
+)
+from opencompass.datasets import generic_llmjudge_postprocess
+
+with read_base():
+    from .OlympiadBench_categories import categories
+
+# Config dict for the problem prompter (type given as a string name); not referenced again in this file
+olympiadbench_prompter_cfg = dict(
+    type='OlympiadBenchPrompter'
+)
+
+olympiadbench_reader_cfg = dict(  # shared by every dataset entry and by the judge's dataset_cfg below
+    input_columns=[
+        'problem', 'language', 'subject', 'question_type', 
+        'answer_type', 'is_multiple_answer', 'unit', 'questions'
+    ], 
+    output_column='solution'  # also the {solution} placeholder in GRADER_TEMPLATE
+)
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{problem}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    
+    Judging the correctness of candidates' answers:
+""".strip()  # judge prompt with {problem}/{solution}/{prediction} slots; NOTE(review): it asks for "A"/"B" and then for CORRECT/INCORRECT — confirm which form the postprocessor expects
+
+
+olympiadbench_datasets = []  # one entry per category, filled by the loop below
+for _name in categories:  # categories is imported from .OlympiadBench_categories
+    olympiadbench_infer_cfg = dict(  # does not depend on _name; rebuilt on every iteration
+        prompt_template=dict(
+            type='OlympiadBenchTemplate'
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    # Evaluation configuration: a rule-based evaluator and an LLM judge, both handed to CascadeEvaluator
+    olympiadbench_eval_cfg = dict(
+        evaluator=dict(
+            type=CascadeEvaluator,
+            rule_evaluator=dict(
+                type=MATHVerifyEvaluator,
+            ),
+            llm_evaluator=dict(
+                type=GenericLLMEvaluator,
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
+                    ],
+                        round=[
+                        dict(
+                            role='HUMAN',
+                            prompt = GRADER_TEMPLATE
+                        ),
+                    ]),
+                ),
+                dataset_cfg=dict(  # same type, path, name and reader as the entry appended below
+                    type=OlympiadBenchDataset,
+                    path='opencompass/OlympiadBench',
+                    name=_name,
+                    reader_cfg=olympiadbench_reader_cfg,
+                ),
+                judge_cfg=dict(),  # left empty here; presumably supplied by the run-level config — confirm
+                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+            ),
+            parallel=False  # NOTE(review): presumably the LLM judge then only re-checks samples the rule evaluator rejects — confirm
+        )
+    )
+
+    olympiadbench_datasets.append(
+        dict(
+            type=OlympiadBenchDataset,
+            abbr=f'OlympiadBench_{_name}',  # abbreviation carries the category name
+            path='opencompass/OlympiadBench',
+            name=_name,
+            reader_cfg=olympiadbench_reader_cfg,
+            infer_cfg=olympiadbench_infer_cfg,
+            eval_cfg=olympiadbench_eval_cfg,
+            n=1,  # presumably the number of generations per question — TODO confirm
+        )
+    )
--- a/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen_a67965.py
+++ b/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen_a67965.py
@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets.ProteinLMBench import ProteinLMBenchDataset, ProteinLMBenchEvaluator
+
+QUERY_TEMPLATE = "Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is the letter among {start} through {end}.\n{question}"  # {start}, {end} and {question} match columns listed in reader_cfg below
+
+
+# Reader configuration: columns exposed to the prompt, and the reference-answer column
+reader_cfg = dict(
+    input_columns=['question', 'start', 'end', 'options'],  # 'options' is listed but not referenced by QUERY_TEMPLATE
+    output_column='label',
+)
+
+# Inference configuration: a single HUMAN turn built from QUERY_TEMPLATE, with ZeroRetriever and GenInferencer
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=QUERY_TEMPLATE
+                )
+            ], ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration: scoring is delegated to ProteinLMBenchEvaluator
+eval_cfg = dict(
+    evaluator=dict(type=ProteinLMBenchEvaluator),
+)
+
+proteinlmbench_dataset = dict(  # the only dataset entry of this config
+    abbr='ProteinLMBench',
+    type=ProteinLMBenchDataset,
+    path='tsynbio/ProteinLMBench',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg
+)
+
+proteinlmbench_datasets = [proteinlmbench_dataset]  # list form of the single entry above
--- a/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen_a67965.py
+++ b/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen_a67965.py
@ -0,0 +1,89 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets.ProteinLMBench import ProteinLMBenchDataset
+
+QUERY_TEMPLATE = "Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is the letter among {start} through {end}.\n{question}"  # {start}, {end} and {question} match columns listed in reader_cfg below
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: {question}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()  # judge prompt with {question}/{label}/{prediction} slots; NOTE(review): it asks for "A"/"B" and then for CORRECT/INCORRECT — confirm which form the postprocessor expects
+
+
+reader_cfg = dict(  # used by the dataset entry and by the judge's dataset_cfg below
+    input_columns=['question', 'start', 'end', 'options'],  # 'options' is listed but not referenced by either template
+    output_column='label',  # also the {label} placeholder in GRADER_TEMPLATE
+)
+
+infer_cfg = dict(  # a single HUMAN turn built from QUERY_TEMPLATE
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+eval_cfg = dict(  # LLM-as-judge evaluation: SYSTEM preamble followed by GRADER_TEMPLATE as the HUMAN turn
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(  # same dataset type, path and reader as the dataset entry below
+            type=ProteinLMBenchDataset,
+            path='tsynbio/ProteinLMBench',
+            reader_cfg=reader_cfg,
+        ),
+        judge_cfg=dict(),  # left empty here; presumably supplied by the run-level config — confirm
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+
+proteinlmbench_dataset = dict(  # the only dataset entry of this config
+    abbr='ProteinLMBench',
+    type=ProteinLMBenchDataset,
+    path='tsynbio/ProteinLMBench',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg
+)
+
+proteinlmbench_datasets = [proteinlmbench_dataset]  # list form of the single entry above
--- a/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen.py
+++ b/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .PubMedQA_llmjudge_gen_f00302 import PubMedQA_datasets
--- a/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen_f00302.py
+++ b/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen_f00302.py
@ -0,0 +1,94 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets.PubMedQA import PubMedQADataset
+
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
+Question:\n
+{question}
+Options:\n
+{choices}
+""".strip()  # inference prompt; {question} and {choices} match PubMedQA_reader_cfg.input_columns
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+    <Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
+    <Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()  # judge prompt with {question}/{choices}/{label}/{prediction} slots; NOTE(review): it asks for "A"/"B" and then for CORRECT/INCORRECT — confirm which form the postprocessor expects
+
+PubMedQA_datasets = []  # receives a single entry at the end of this config
+
+PubMedQA_reader_cfg = dict(  # used by the dataset entry and by the judge's dataset_cfg
+    input_columns=['question', 'choices'],
+    output_column='label',  # also the {label} placeholder in GRADER_TEMPLATE
+)
+
+PubMedQA_infer_cfg = dict(  # a single HUMAN turn built from QUERY_TEMPLATE
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+PubMedQA_eval_cfg = dict(  # LLM-as-judge evaluation: SYSTEM preamble followed by GRADER_TEMPLATE as the HUMAN turn
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(  # same dataset type, path and reader as the dataset entry below
+            type=PubMedQADataset,
+            path='qiaojin/PubMedQA',
+            reader_cfg=PubMedQA_reader_cfg,
+        ),
+        judge_cfg=dict(),  # left empty here; presumably supplied by the run-level config — confirm
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+
+PubMedQA_datasets.append(  # the list ends up holding this single entry
+    dict(
+        abbr='PubMedQA',  # constant name; nothing is interpolated
+        type=PubMedQADataset,
+        path='qiaojin/PubMedQA',
+        reader_cfg=PubMedQA_reader_cfg,
+        infer_cfg=PubMedQA_infer_cfg,
+        eval_cfg=PubMedQA_eval_cfg,
+    )
+)
--- a/opencompass/configs/datasets/SciEval/SciEval_5shot_gen_4043d4.py
+++ b/opencompass/configs/datasets/SciEval/SciEval_5shot_gen_4043d4.py
@ -0,0 +1,65 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+from opencompass.datasets import SciEvalDataset  
+
+# Evaluate only the biology + multiple-choice part of the test split
+_hint = ('Given a question and four options, please select the right answer. '
+         "Your answer should be 'A', 'B', 'C' or 'D'.")
+category = [  # passed as category= to the dataset entry below
+    'biology',
+]
+
+scieval_reader_cfg = dict(  # question text plus the four option columns; 'target' holds the reference answer
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='test',  # NOTE(review): presumably the few-shot examples are then drawn from the test split itself — confirm this is intended
+)
+
+scieval_infer_cfg = dict(  # few-shot generation: ice_template for the examples, prompt_template for the final question
+    ice_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+            ),
+            dict(role='BOT', prompt='{target}\n')  # example turns end with the reference answer
+        ]),
+    ),
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin='</E>',  # same marker as ice_token below; presumably where the examples are inserted — confirm
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                ),
+            ],
+        ),
+        ice_token='</E>',
+    ),
+    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),  # five fixed example ids
+    inferencer=dict(type=GenInferencer),
+)
+
+scieval_eval_cfg = dict(  # AccwithDetailsEvaluator applied to post-processed predictions
+    evaluator=dict(type=AccwithDetailsEvaluator),
+    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),  # presumably extracts the chosen option letter from the raw output — confirm
+)
+
+scieval_datasets = [  # single entry, restricted via category (['biology'] above)
+    dict(
+        abbr='scieval_biology',
+        type=SciEvalDataset,
+        path='OpenDFM/SciEval',
+        name='default',
+        category=category, 
+        reader_cfg=scieval_reader_cfg,
+        infer_cfg=scieval_infer_cfg,
+        eval_cfg=scieval_eval_cfg,
+    )
+]
--- a/opencompass/configs/datasets/SciEval/SciEval_5shot_llmjudge_gen_b7b684.py
+++ b/opencompass/configs/datasets/SciEval/SciEval_5shot_llmjudge_gen_b7b684.py
@ -0,0 +1,130 @@
+# SciEval_5shot_llmjudge_gen_b7b684.py
+
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.utils.text_postprocessors import match_answer_pattern
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets import SciEvalDataset
+
+with read_base():
+    from .SciEval_lifescience_sets import SciEval_lifescience_subsets
+    
+category = [  # passed unchanged as category= to every dataset entry built in the loop below
+    'biology',
+]
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. 
+
+{input}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()  # inference prompt; placeholders match scieval_reader_cfg.input_columns
+
+GRADER_TEMPLATE = """
+Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+Here are some evaluation criteria:
+1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+A: CORRECT 
+B: INCORRECT
+Just return the letters "A" or "B", with no text around it.
+
+Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+<Original Question Begin>: {input}
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+<Original Question End>
+
+<Gold Target Begin>:
+{target}
+<Gold Target End>
+
+<Predicted Answer Begin>:
+{prediction}
+<Predicted End>
+
+Judging the correctness of candidates' answers:
+""".strip()  # judge prompt with question/options/{target}/{prediction} slots; NOTE(review): it asks for "A"/"B" and then for CORRECT/INCORRECT — confirm which form the postprocessor expects
+
+scieval_reader_cfg = dict(  # question text plus the four option columns; 'target' holds the reference answer
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='test',  # NOTE(review): the train split is mapped to 'test' — confirm this is intended
+)
+
+scieval_datasets = []  # one LLM-judged entry per name in SciEval_lifescience_subsets
+for name in SciEval_lifescience_subsets:  # NOTE(review): name only feeds abbr below; category stays ['biology'] for every entry — confirm intended
+    scieval_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+                ]
+            )
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),  # NOTE(review): no ice_template/ice_token is configured in this infer cfg — confirm few-shot examples are actually injected
+        inferencer=dict(type=GenInferencer),
+    )
+
+    scieval_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt=(
+                                'You are a helpful assistant who evaluates the correctness '
+                                "and quality of models' outputs."
+                            ),
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(  # NOTE(review): no category is passed here, unlike the dataset entry below — confirm
+                type=SciEvalDataset,
+                path='OpenDFM/SciEval',
+                name='default',
+                reader_cfg=scieval_reader_cfg,
+            ),
+            judge_cfg=dict(),  # left empty here; presumably supplied by the run-level config — confirm
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+        pred_role='BOT',
+    )
+
+    scieval_datasets.append(
+        dict(
+            abbr=f'scieval_lifescience_{name}_llmjudge',  # the only place the loop variable is used
+            type=SciEvalDataset,
+            path='OpenDFM/SciEval',
+            name='default',
+            category=category, 
+            reader_cfg=scieval_reader_cfg,
+            infer_cfg=scieval_infer_cfg,
+            eval_cfg=scieval_eval_cfg,
+            mode='singlescore',  # NOTE(review): purpose not visible in this file — confirm it is needed
+        )
+    )
--- a/opencompass/configs/datasets/SciEval/SciEval_lifescience_sets.py
+++ b/opencompass/configs/datasets/SciEval/SciEval_lifescience_sets.py
@ -0,0 +1,6 @@
+SciEval_lifescience_subsets = [
+    'biology',        # college-level biology
+    'physics',
+    'chemistry'
+
+]
--- a/opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py
+++ b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py
@ -0,0 +1,92 @@
+from opencompass.datasets import SciKnowEvalDataset, SciKnowEvalEvaluator
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+
+ZERO_SHOT_PROMPT = '{q4}'  # the whole prompt is the 'q4' column listed in reader_cfg below
+
+# Reader configuration: columns exposed to the prompt, and the reference-answer column
+reader_cfg = dict(
+    input_columns=[
+        'prompt',
+        'question',
+        'choices',
+        'label',
+        'answerKey',
+        'type',
+        'domain',
+        'details',
+        'answer',
+        'q4'
+    ],
+    output_column='answerKey',  # also appears in input_columns above
+)
+
+# Inference configuration: a single HUMAN turn built from ZERO_SHOT_PROMPT, with ZeroRetriever and GenInferencer
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration: scoring is delegated to SciKnowEvalEvaluator
+eval_cfg = dict(
+    evaluator=dict(type=SciKnowEvalEvaluator),
+    pred_role='BOT',
+)
+sciknoweval_dataset_biology = dict(  # the four entries below differ only in abbr and subset
+    type=SciKnowEvalDataset,
+    abbr='sciknoweval_biology',
+    path='hicai-zju/SciKnowEval',
+    prompt_mode='zero-shot',
+    subset='biology',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+)
+
+sciknoweval_dataset_chemistry = dict(  # chemistry subset
+    type=SciKnowEvalDataset,
+    abbr='sciknoweval_chemistry',
+    path='hicai-zju/SciKnowEval',
+    subset='chemistry',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+)
+
+sciknoweval_dataset_material = dict(  # material subset
+    type=SciKnowEvalDataset,
+    abbr='sciknoweval_material',
+    path='hicai-zju/SciKnowEval',
+    subset='material',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+)
+
+sciknoweval_dataset_physics = dict(  # physics subset
+    type=SciKnowEvalDataset,
+    abbr='sciknoweval_physics',
+    path='hicai-zju/SciKnowEval',
+    prompt_mode='zero-shot',
+    subset='physics',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+)
+
+
+sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material]  # all four subset entries; physics precedes material here
--- a/opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py
+++ b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py
@ -0,0 +1,232 @@
+from opencompass.datasets import SciKnowEvalDataset
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.evaluator import GenericLLMEvaluator
+
+ZERO_SHOT_PROMPT = '{q4}'
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: Q: {q4}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{answerKey}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+# Reader configuration
+reader_cfg = dict(
+    input_columns=[
+        'prompt',
+        'question',
+        'choices',
+        'label',
+        'answerKey',
+        'type',
+        'domain',
+        'details',
+        'answer',
+        'q4'
+    ],
+    output_column='answerKey',
+)
+
+# Inference configuration: a single HUMAN turn, ZeroRetriever, GenInferencer
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    # ZERO_SHOT_PROMPT is '{q4}': the prompt is the `q4` column as-is
+                    role='HUMAN',
+                    prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration
+eval_cfg_biology = dict(
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(
+            type=SciKnowEvalDataset,
+            path='hicai-zju/SciKnowEval',
+            prompt_mode='zero-shot',
+            subset='biology',
+            reader_cfg=reader_cfg,
+        ),
+        judge_cfg=dict(),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+
+eval_cfg_chemistry = dict(
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(
+            type=SciKnowEvalDataset,
+            path='hicai-zju/SciKnowEval',
+            prompt_mode='zero-shot',
+            reader_cfg=reader_cfg,
+            subset='chemistry',
+        ),
+        judge_cfg=dict(),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+
+eval_cfg_material = dict(
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(
+            type=SciKnowEvalDataset,
+            path='hicai-zju/SciKnowEval',
+            prompt_mode='zero-shot',
+            reader_cfg=reader_cfg,
+            subset='material',
+        ),
+        judge_cfg=dict(),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+
+eval_cfg_physics = dict(
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(
+            type=SciKnowEvalDataset,
+            path='hicai-zju/SciKnowEval',
+            prompt_mode='zero-shot',
+            reader_cfg=reader_cfg,
+            subset='physics',
+        ),
+        judge_cfg=dict(),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+
+sciknoweval_dataset_biology = dict(
+    type=SciKnowEvalDataset,
+    abbr='sciknoweval_biology_llmjudge',
+    path='hicai-zju/SciKnowEval',
+    prompt_mode='zero-shot',
+    subset='biology',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg_biology,
+)
+
+sciknoweval_dataset_chemistry = dict(
+    type=SciKnowEvalDataset,
+    abbr='sciknoweval_chemistry_llmjudge',
+    path='hicai-zju/SciKnowEval',
+    subset='chemistry',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg_chemistry,
+)
+sciknoweval_dataset_material = dict(
+    type=SciKnowEvalDataset,
+    abbr='sciknoweval_material_llmjudge',
+    path='hicai-zju/SciKnowEval',
+    subset='material',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg_material,
+)
+
+
+sciknoweval_dataset_physics = dict(
+    type=SciKnowEvalDataset,
+    abbr='sciknoweval_physics_llmjudge',
+    path='hicai-zju/SciKnowEval',
+    prompt_mode='zero-shot',
+    subset='physics',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg_physics,
+)
+sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material]
--- a/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen.py
+++ b/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .ScienceQA_llmjudge_gen_f00302 import ScienceQA_datasets
--- a/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen_f00302.py
+++ b/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen_f00302.py
@ -0,0 +1,94 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets.ScienceQA import ScienceQADataset
+
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
+Question:\n
+{question}
+Options:\n
+{choices}
+""".strip()
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+    <Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
+    <Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+ScienceQA_datasets = []
+
+ScienceQA_reader_cfg = dict(
+    input_columns=['question', 'choices'],
+    output_column='label',
+)
+
+ScienceQA_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+ScienceQA_eval_cfg = dict(
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(
+            type=ScienceQADataset,
+            path='derek-thomas/ScienceQA',
+            reader_cfg=ScienceQA_reader_cfg,
+        ),
+        judge_cfg=dict(),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+
+ScienceQA_datasets.append(  # the only entry registered in this config
+    dict(
+        abbr='ScienceQA',
+        type=ScienceQADataset,
+        path='derek-thomas/ScienceQA',
+        reader_cfg=ScienceQA_reader_cfg,  # inputs: question, choices; target: label
+        infer_cfg=ScienceQA_infer_cfg,  # one HUMAN turn built from QUERY_TEMPLATE
+        eval_cfg=ScienceQA_eval_cfg,  # GenericLLMEvaluator prompted with GRADER_TEMPLATE
+    )
+)
--- a/opencompass/configs/datasets/aime2024/aime2024_cascade_eval_gen_5e9f4f.py
+++ b/opencompass/configs/datasets/aime2024/aime2024_cascade_eval_gen_5e9f4f.py
@ -0,0 +1,118 @@
+"""
+Summary: A config for AIME-2024 Evaluation.
+Setting:
+    Shot: 0-shot
+    Evaluator:
+        - CascadeEvaluator
+            - MATHVerifyEvaluator
+            - GenericLLMEvaluator
+    Repeat: 1
+Available Models:
+    - Instruct/Chat Models
+"""
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets import Aime2024Dataset
+from opencompass.evaluator import (
+    CascadeEvaluator,
+    GenericLLMEvaluator,
+    MATHVerifyEvaluator
+)
+
+
+aime2024_reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+
+aime2024_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{question}\nRemember to put your final answer within \\boxed{}.',
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    
+    Judging the correctness of candidates' answers:
+""".strip()
+
+cascade_evaluator = dict(
+    type=CascadeEvaluator,
+    rule_evaluator=dict(  # rule-based stage
+        type=MATHVerifyEvaluator,
+    ),
+    llm_evaluator=dict(  # LLM-judge stage, prompted with GRADER_TEMPLATE
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(  # same dataset/reader as the entry in aime2024_datasets
+            type=Aime2024Dataset,
+            path='opencompass/aime2024',
+            reader_cfg=aime2024_reader_cfg,
+        ),
+        judge_cfg=dict(),  # left empty here; presumably supplied by the run config - confirm
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+    parallel=False,  # NOTE(review): presumably runs the two stages in sequence - confirm in CascadeEvaluator
+)
+
+
+aime2024_eval_cfg = dict(
+    evaluator=cascade_evaluator,
+)
+
+aime2024_datasets = [
+    dict(
+        abbr='aime2024',
+        type=Aime2024Dataset,
+        path='opencompass/aime2024',
+        reader_cfg=aime2024_reader_cfg,
+        infer_cfg=aime2024_infer_cfg,
+        eval_cfg=aime2024_eval_cfg,
+        n=1,  # number of times the dataset is evaluated ("Repeat: 1" in the module docstring)
+    )
+]
--- a/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
+++ b/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py
--- a/opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py
+++ b/opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py
@ -0,0 +1,115 @@
+"""
+Summary: A config for AIME-2025 Evaluation.
+Setting:
+    Shot: 0-shot
+    Evaluator:
+        - CascadeEvaluator
+            - MATHVerifyEvaluator
+            - GenericLLMEvaluator
+    Repeat: 1
+Available Models:
+    - Instruct/Chat Models
+"""
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import CustomDataset
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.evaluator import (
+    CascadeEvaluator,
+    GenericLLMEvaluator,
+    MATHVerifyEvaluator
+)
+
+aime2025_reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+
+aime2025_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{question}\nRemember to put your final answer within \\boxed{}.',
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    
+    Judging the correctness of candidates' answers:
+""".strip()
+
+cascade_evaluator = dict(
+    type=CascadeEvaluator,
+    rule_evaluator=dict(
+        type=MATHVerifyEvaluator,
+    ),
+    llm_evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(
+            type=CustomDataset,
+            path='opencompass/aime2025',
+            reader_cfg=aime2025_reader_cfg,
+        ),
+        judge_cfg=dict(),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+    parallel=False,
+)
+aime2025_eval_cfg = dict(
+    evaluator=cascade_evaluator,
+)
+
+aime2025_datasets = [
+    dict(
+        type=CustomDataset,
+        abbr='aime2025',
+        path='opencompass/aime2025',
+        reader_cfg=aime2025_reader_cfg,
+        infer_cfg=aime2025_infer_cfg,
+        eval_cfg=aime2025_eval_cfg,
+        n=1,
+    )
+]
--- a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_c3d5ad.py
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_c3d5ad.py
@ -0,0 +1,44 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
+
+bigcodebench_full_reader_cfg = dict(
+    input_columns=['instruct_prompt'],
+    output_column='test',
+)
+
+bigcodebench_full_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[dict(role='system', fallback_role='HUMAN', prompt='')],  # empty system turn
+            round=[dict(role='HUMAN', prompt='{instruct_prompt}')])),  # instruct prompt as-is
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+bigcodebench_full_eval_cfg = dict(
+    evaluator=dict(
+        type=BigCodeBenchEvaluator,
+        release_version='v0.1.2',
+        eval_type='instruct',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api=
+        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
+        dataset_version='full',
+    ),
+    pred_role='BOT',
+)
+
+bigcodebench_full_instruct_datasets = [
+    dict(abbr='bigcodebench_full_instruct',
+         type=BigCodeBenchDataset,
+         path='opencompass/bigcodebench',
+         reader_cfg=bigcodebench_full_reader_cfg,
+         infer_cfg=bigcodebench_full_infer_cfg,
+         eval_cfg=bigcodebench_full_eval_cfg,
+         release_version='v0.1.2',  # same release as the evaluator in bigcodebench_full_eval_cfg
+         n=5,  # NOTE(review): presumably repeated generations per problem - confirm
+         k=3)  # NOTE(review): no dataset_version here, unlike the 'hard' config - confirm the loader default
+]
--- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py
+++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py
@ -0,0 +1,48 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator)
+
+bigcodebench_hard_reader_cfg = dict(
+    input_columns=['instruct_prompt'],
+    output_column='test',
+)
+
+bigcodebench_hard_infer_cfg = dict(prompt_template=dict(
+    type=PromptTemplate,
+    template=dict(
+        begin=[dict(role='system', fallback_role='HUMAN', prompt='')],
+        round=[
+            dict(role='HUMAN', prompt='{instruct_prompt}'),
+        ])),
+       retriever=dict(type=ZeroRetriever),
+       inferencer=dict(type=GenInferencer)
+)
+
+bigcodebench_hard_eval_cfg = dict(
+    evaluator=dict(
+        type=BigCodeBenchEvaluator,
+        release_version='v0.1.2',
+        eval_type='instruct',
+        # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',
+        remote_execute_api=
+        'https://opencompass-opencompass-bigcodebench-evaluator.hf.space',  # noqa: E501
+        dataset_version='hard',
+    ),
+    pred_role='BOT',
+)
+
+bigcodebench_hard_instruct_datasets = [
+    dict(
+        abbr='bigcodebench_hard_instruct',
+        type=BigCodeBenchDataset,
+        path='opencompass/bigcodebench',
+        reader_cfg=bigcodebench_hard_reader_cfg,
+        infer_cfg=bigcodebench_hard_infer_cfg,
+        eval_cfg=bigcodebench_hard_eval_cfg,
+        release_version='v0.1.2',
+        dataset_version='hard',
+        n=5,
+        k=3
+    )
+]
--- a/opencompass/configs/datasets/gpqa/gpqa_cascade_eval_gen_772ea0.py
+++ b/opencompass/configs/datasets/gpqa/gpqa_cascade_eval_gen_772ea0.py
@ -0,0 +1,118 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess
+from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.utils.text_postprocessors import match_answer_pattern
+
+# openai_simple_eval prompt
+align_prompt = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD.
+
+{question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+gpqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D'],
+    output_column='answer')
+
+gpqa_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt=align_prompt),
+            ], )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+
+
+gpqa_datasets = []
+gpqa_subsets = {
+    # 'extended': 'gpqa_extended.csv',
+    # 'main': 'gpqa_main.csv',
+    'diamond': 'gpqa_diamond.csv'
+}
+
+for split in gpqa_subsets:  # keys are subset names, values the CSV file names
+    gpqa_eval_cfg = dict(
+        evaluator=dict(
+            type=CascadeEvaluator,
+            rule_evaluator=dict(
+                type=AccEvaluator,  # predictions go through the ANSWER: pattern below first
+                pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'),
+            ),
+            llm_evaluator=dict(
+                type=GenericLLMEvaluator,
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(
+                        begin=[
+                            dict(
+                                role='SYSTEM',
+                                fallback_role='HUMAN',
+                                prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
+                        ],
+                        round=[
+                            dict(
+                                role='HUMAN',
+                                prompt=GRADER_TEMPLATE,
+                            ),
+                        ]),
+                ),
+                dataset_cfg=dict(  # same dataset/reader as the entry appended below
+                    type=GPQADataset,
+                    path='./data/gpqa/',
+                    name=gpqa_subsets[split],
+                    reader_cfg=gpqa_reader_cfg,
+                ),
+                judge_cfg=dict(),  # left empty here; presumably supplied by the run config - confirm
+                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+            ),
+            parallel=False,  # NOTE(review): presumably runs the two stages in sequence - confirm
+        ),
+    )
+    gpqa_datasets.append(
+        dict(
+            abbr='GPQA_' + split,  # e.g. 'GPQA_diamond'
+            type=GPQADataset,
+            path='./data/gpqa/',
+            name=gpqa_subsets[split],
+            reader_cfg=gpqa_reader_cfg,
+            infer_cfg=gpqa_infer_cfg,
+            eval_cfg=gpqa_eval_cfg,
+            mode='singlescore',
+        )
+    )
--- a/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_repeat_gen_dcae0e.py
+++ b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_repeat_gen_dcae0e.py
@ -0,0 +1,37 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg,
+        n=5,
+        k=3)
+]
--- a/opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat_gen_41b01c.py
+++ b/opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat_gen_41b01c.py
@ -0,0 +1,39 @@
+# THIS SHALL ALSO BE DEPRECATED
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2
+
+humaneval_plus_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_plus_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Complete the following python code:\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humaneval_plus_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalPlusEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_plus_datasets = [
+    dict(
+        abbr='humaneval_plus',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_plus_reader_cfg,
+        infer_cfg=humaneval_plus_infer_cfg,
+        eval_cfg=humaneval_plus_eval_cfg,
+        n=5,
+        k=3)
+]
--- a/opencompass/configs/datasets/humaneval_pro/README.md
+++ b/opencompass/configs/datasets/humaneval_pro/README.md
@ -0,0 +1,17 @@
+# HumanEval pro
+
+## OC results
+
+|           model            |   pass@1 |
+|:--------------------------:|---------:|
+|qwen2.5-coder-7b-instruct-hf|     65   |
+|  qwen2.5-14b-instruct-hf   |     67   |
+|  deepseek-v2-lite-chat-hf  |     35   |
+
+## CodeEval-pro results
+
+|           model            |   pass@1 |
+|:--------------------------:|---------:|
+|qwen2.5-coder-7b-instruct-hf|     65   |
+|  qwen2.5-14b-instruct-hf   |     65   |
+|  deepseek-v2-lite-chat-hf  |     28   |
--- a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
+++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .humaneval_pro_gen_3dc067 import humanevalpro_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py
+++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py
@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+humanevalpro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+humanevalpro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalpro_eval_cfg = dict(
+    evaluator=dict(type=HumanevalProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space')
+)
+
+humanevalpro_datasets = [
+    dict(
+        abbr='humaneval_pro',
+        type=HumanevalevalProDataset,
+        path='opencompass/humaneval_pro',
+        reader_cfg=humanevalpro_reader_cfg,
+        infer_cfg=humanevalpro_infer_cfg,
+        eval_cfg=humanevalpro_eval_cfg,)
+]
--- a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py
+++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py
@ -0,0 +1,48 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+humanevalpro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+humanevalpro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalpro_eval_cfg = dict(
+    evaluator=dict(type=HumanevalProEvaluator,
+                   ip_address='https://opencompass-multiple-evaluator.hf.space')
+)
+
+humanevalpro_datasets = [
+    dict(
+        abbr='humaneval_pro',
+        type=HumanevalevalProDataset,
+        path='opencompass/humaneval_pro',
+        reader_cfg=humanevalpro_reader_cfg,
+        infer_cfg=humanevalpro_infer_cfg,
+        eval_cfg=humanevalpro_eval_cfg,
+        n=5,
+        k=3)
+]
--- a/opencompass/configs/datasets/humanevalx/humanevalx_repeat_gen_3d84a3.py
+++ b/opencompass/configs/datasets/humanevalx/humanevalx_repeat_gen_3d84a3.py
@ -0,0 +1,43 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
+
+humanevalx_reader_cfg = dict(
+    input_columns=['prompt'], output_column='declaration', train_split='test')
+
+humanevalx_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humanevalx_eval_cfg_dict = {
+    lang : dict(
+        evaluator=dict(
+            type=HumanevalXEvaluator,
+            language=lang,
+            ip_address=
+            'localhost',  # replace with your code_eval_server ip_address and port
+            port=5001),  # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
+        pred_role='BOT')
+    for lang in ['python', 'cpp', 'go', 'java', 'js']   # do not support rust now
+}
+
+# Please download the needed `xx.jsonl.gz` from
+# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
+# and move them into `data/humanevalx/` folder
+humanevalx_datasets = [
+    dict(
+        type=HumanevalXDataset,
+        abbr=f'humanevalx-{lang}',
+        language=lang,
+        path='./data/humanevalx',
+        reader_cfg=humanevalx_reader_cfg,
+        infer_cfg=humanevalx_infer_cfg,
+        eval_cfg=humanevalx_eval_cfg_dict[lang],
+        n=5,
+        k=3)
+    for lang in ['python', 'cpp', 'go', 'java', 'js']
+]
--- a/opencompass/configs/datasets/internsandbox/internsandbox_gen.py
+++ b/opencompass/configs/datasets/internsandbox/internsandbox_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .internsandbox_gen_44b982 import internsandbox_datasets
--- a/opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py
+++ b/opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py
@ -0,0 +1,59 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import InternSandboxDataset, InternSandboxEvaluator
+
+
+_SANDBOXS_ = ['aquarium', 'arc', 'arrowmaze', 'bbehboardgameqa', 'bbehbooleanexpressions', 'BbehDyckLanguages', 'BbehGeometricShapes', 'BbehMultistepArithmetic', 'bbehobjectcounting', 'bbehobjectproperties', 'bbehshuffobject', 'BbehWebOfLies', 'BbehWordSorting', 'binairo', 'calcudoku', 'campsite', 'cipher', 'cryptomath', 'dominosa', 'futoshiki', 'galaxies', 'game24', 'kakurasu', 'korLogicAnalogicalReasoning', 'korLogicCanonicalPropositions', 'korLogicCooperativePrinciple', 'korLogicDefinitions', 'korLogicDerivativeReasoningOfPropositionalLogic', 'korLogicDisjunctiveNormalFormAndConjunctiveNormalForm', 'korLogicDynamicLogic', 'korLogicEnumerativeInductiveReasoning', 'korLogicEpistemicLogic', 'korLogicEquivalenceCalculus', 'korLogicFigureOfTheSyllogism', 'korLogicFormalFallacies', 'korLogicInductionParadox', 'korLogicLogicalMethodsForExploringCauseAndEffectRelationships', 'korLogicPredicateLogicFormalization', 'korLogicPropositionalLogicConcepts', 'korLogicPropositionalLogicFormalization', 'korLogicResolution', 'korLogicSpeechActs', 'korLogicStatisticalReasoning', 'korLogicTemporalPropositions', 'korLogicTruthValueModalPropositions', 'korOperationUnicode20ac', 'korOperationUnicode2295', 'korOperationUnicode25a0', 'korOperationUnicode25a1', 'korOperationUnicode25b3', 'korOperationUnicode25bd', 'korOperationUnicode25cb', 'korOperationUnicode25ce', 'korOperationUnicode25cf', 'korOperationUnicode2605', 'korOperationUnicodeffe0', 'korOperationUnicodeffe1', 'korPuzzle24Points', 'korPuzzleArrowMaze', 'korPuzzleCalcudoko', 'korPuzzleCampsite', 'korPuzzleConnectWords', 'korPuzzleCryptoMath', 'korPuzzleKukurasu', 'korPuzzleLogicPuzzle', 'korPuzzleSkyscrapers', 'korPuzzleWordBrainTeasers', 'korPuzzleWordLadder', 'korPuzzleWordRootsAndAffixes', 'korPuzzleWordscapes', 'korPuzzleWordSearch', 'LightUp', 'maze', 'minesweeper', 'nonograms', 'starbattle', 'stitches', 'sudoku', 'tents', 'thermometers']
+
+internsandbox_reader_cfg = dict(
+    input_columns=['prompt'], 
+    output_column='ground_truth'
+)
+
+internsandbox_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='You are a helpful assistant.',
+                )
+            ],
+            round=[
+                dict(
+                    role='HUMAN', 
+                    prompt='{prompt}'
+                ),
+            ],
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+internsandbox_eval_cfg = {
+    sandbox: dict(
+        evaluator=dict(
+            type=InternSandboxEvaluator,
+            short_penalty=False,
+            format_penalty=False,
+        ),
+        pred_role='BOT',
+    ) for sandbox in _SANDBOXS_
+}
+
+internsandbox_datasets = [
+    dict(
+        type=InternSandboxDataset,
+        abbr=f'internsandbox-{sandbox}',
+        path='./data/InternSandboxBenchmark_verified_V0.3.1/',
+        local_mode=True,
+        sandbox=sandbox,
+        reader_cfg=internsandbox_reader_cfg,
+        infer_cfg=internsandbox_infer_cfg,
+        eval_cfg=internsandbox_eval_cfg[sandbox],
+    ) for sandbox in _SANDBOXS_
+]
--- a/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_xml_gen_17854d.py
+++ b/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_xml_gen_17854d.py
@ -1,17 +1,28 @@
+"""
+Summary: A config for KoR-Bench Evaluation.
+Setting:
+    Shot: 0-shot
+    Evaluator:
+        - CascadeEvaluator
+            - korbenchEvaluator
+            - GenericLLMEvaluator
+    Repeat: 1
+Available Models:
+    - Instruct/Chat Models
+"""
+from datasets import parallel
 from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator
 from opencompass.datasets import generic_llmjudge_postprocess
-from opencompass.utils import xml_tag_postprocessor

 categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']

-
 GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
-    
+
    Here are some evaluation criteria:
    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
@ -30,7 +41,7 @@ GRADER_TEMPLATE = """
    <Original Question Begin>: \n{prompt}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
-    
+
    Judging the correctness of candidates' answers:
 """.strip()

@ -50,7 +61,7 @@ for category in categories:
            round=[
                dict(
                    role='HUMAN',
-                    prompt='{prompt}' # f-string
+                    prompt='{prompt}'  # f-string
                )
            ]
        )
@ -66,41 +77,46 @@ for category in categories:
    infer_cfg = dict(
        prompt_template=prompt_template,
        retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=1024),
+        inferencer=dict(type=GenInferencer),
    )

    # Evaluation configuration
    eval_cfg = dict(
        evaluator=dict(
-            type=GenericLLMEvaluator,
-            prompt_template=dict(
-                type=PromptTemplate,
-                template=dict(
-                begin=[
-                    dict(
-                        role='SYSTEM',
-                        fallback_role='HUMAN',
-                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
-                ],
-                    round=[
-                    dict(
-                        role='HUMAN',
-                        prompt = GRADER_TEMPLATE
-                    ),
-                ]),
+            type=CascadeEvaluator,
+            rule_evaluator=dict(
+                type=korbenchEvaluator,
            ),
-            dataset_cfg=dict(
-                type=korbenchDataset,
-                path='opencompass/korbench',
-                prompt_mode='0_shot',
-                category=category,
-                reader_cfg=reader_cfg,
+            llm_evaluator=dict(
+                type=GenericLLMEvaluator,
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(
+                        begin=[
+                            dict(
+                                role='SYSTEM',
+                                fallback_role='HUMAN',
+                                prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
+                        ],
+                        round=[
+                            dict(
+                                role='HUMAN',
+                                prompt=GRADER_TEMPLATE
+                            ),
+                        ]),
+                ),
+                dataset_cfg=dict(
+                    type=korbenchDataset,
+                    path='opencompass/korbench',
+                    prompt_mode='0_shot',
+                    category=category,
+                    reader_cfg=reader_cfg,
+                ),
+                judge_cfg=dict(),
+                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
            ),
-            judge_cfg=dict(),
-            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
-            pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
-        ),
-        pred_role='BOT',
+            parallel=False,
+        )
    )

    # Dataset
@ -113,7 +129,7 @@ for category in categories:
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
-        mode='singlescore',
+        n=1,
    )

-    korbench_0shot_single_datasets.append(korbench_dataset)
+    korbench_0shot_single_datasets.append(korbench_dataset)
--- a/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py
@ -0,0 +1,166 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+from opencompass.datasets.livecodebench import TestOutputPromptConstants
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+                    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg,
+    n=5,
+    k=3
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# TestOutput Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    # LCBCodeExecution_dataset,
+    # LCBTestOutput_dataset,
+]
--- a/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py
--- a/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py
+++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py
@ -0,0 +1,120 @@
+"""
+Summary: A config for LiveMathBench-Hard-202412 Dataset Evaluation.
+Setting:
+    Shot: 0-shot
+    Evaluator:
+        - CascadeEvaluator
+            - MATHVerifyEvaluator
+            - GenericLLMEvaluator
+    Repeat: 1
+Available Models:
+    - Instruct/Chat Models
+"""
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import CustomDataset
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.evaluator import (
+    CascadeEvaluator,
+    GenericLLMEvaluator,
+    MATHVerifyEvaluator,
+)
+
+livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+
+# Inference configuration
+livemathbench_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{question}\nRemember to put your final answer within \\boxed{}.',
+                ),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+
+# Template for the LLM judge
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+
+splits = ['hard_cn', 'hard_en']
+# Dataset configuration
+livemathbench_datasets = [
+    dict(
+        type=CustomDataset,
+        abbr=f'livemathbench_hard_custom_{split}',
+        path='data/LiveMathBench',
+        local_mode=True,
+        file_name=f'202412/{split}.jsonl',
+        reader_cfg=livemathbench_reader_cfg,
+        infer_cfg=livemathbench_infer_cfg,
+        eval_cfg=dict(
+            # Evaluation configuration: rule-based evaluator combined with an LLM judge
+            evaluator=dict(
+                type=CascadeEvaluator,
+                rule_evaluator=dict(
+                    type=MATHVerifyEvaluator,
+                ),
+                llm_evaluator=dict(
+                    type=GenericLLMEvaluator,
+                    prompt_template=dict(
+                        type=PromptTemplate,
+                        template=dict(
+                            begin=[
+                                dict(
+                                    role='SYSTEM',
+                                    fallback_role='HUMAN',
+                                    prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                                )
+                            ],
+                            round=[
+                                dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                            ],
+                        ),
+                    ),
+                    dataset_cfg=dict(
+                        type=CustomDataset,
+                        path='data/LiveMathBench',
+                        local_mode=True,
+                        file_name=f'202412/{split}.jsonl',
+                        reader_cfg=livemathbench_reader_cfg,
+                    ),
+                    judge_cfg={},
+                    dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+                ),
+                parallel=False
+            ),
+        ),
+        n=1, # repeat n times
+    ) for split in splits
+]
--- a/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py
+++ b/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py
@ -4,7 +4,6 @@ from opencompass.openicl.icl_inferencer import GenInferencer

 from opencompass.evaluator import GenericLLMEvaluator
 from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
-from opencompass.utils import xml_tag_postprocessor


 GRADER_TEMPLATE = """
@ -97,7 +96,7 @@ livereasonbench_infer_cfg = dict(
            ],
        )),
    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=16384))
+    inferencer=dict(type=GenInferencer))

 livereasonbench_eval_cfg = dict(
    evaluator=dict(
@ -122,23 +121,22 @@ livereasonbench_eval_cfg = dict(
            type=LiveReasonBenchDataset,
            path='opencompass/LiveReasonBench',
            reader_cfg=livereasonbench_reader_cfg,
+            version='livereasonbench-20250428',
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=livereasonbench_postprocess),
-        pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
    ),
-    pred_role='BOT',
 )

 livereasonbench_datasets = [
    dict(
-        abbr='LiveReasonBench-20241202',
+        abbr='LiveReasonBench-20250428',
        type=LiveReasonBenchDataset,
        path='opencompass/LiveReasonBench',
        reader_cfg=livereasonbench_reader_cfg,
        infer_cfg=livereasonbench_infer_cfg,
        eval_cfg=livereasonbench_eval_cfg,
-        version='livereasonbench-20241202',
-        mode='singlescore',
+        version='livereasonbench-20250428',
+        n=1
  )
 ]
--- a/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py
+++ b/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py
@ -0,0 +1,117 @@
+"""
+Summary: A config for MATH-500 (PRM800K 500-problem subset) Evaluation.
+Setting:
+    Shot: 0-shot
+    Evaluator:
+        - CascadeEvaluator
+            - MATHVerifyEvaluator
+            - GenericLLMEvaluator
+Available Models:
+    - Instruct/Chat Models
+"""
+
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets import MATHDataset
+from opencompass.evaluator import (
+    CascadeEvaluator,
+    GenericLLMEvaluator,
+    MATHVerifyEvaluator
+)
+
+# ----------------------------- Detailed Config -----------------------------
+
+math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{problem}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+cascade_evaluator = dict(
+    type=CascadeEvaluator,
+    rule_evaluator=dict(
+        type=MATHVerifyEvaluator,
+    ),
+    llm_evaluator= dict(
+        dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(
+                type=MATHDataset,
+                path='opencompass/math',
+                file_name = 'test_prm800k_500.json',
+                reader_cfg=math_reader_cfg,
+                n=4,
+            ),
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        )
+    ),
+    parallel=False,
+)
+
+math_datasets = [
+    dict(
+        type=MATHDataset,
+        abbr=f'math_prm800k_500',
+        path='opencompass/math',
+        file_name = 'test_prm800k_500.json',
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=dict(
+            evaluator=cascade_evaluator,
+        ),
+        n=1,
+    )
+]
--- a/opencompass/configs/datasets/math/math_500_gen.py
+++ b/opencompass/configs/datasets/math/math_500_gen.py
@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import CustomDataset
-from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
+from opencompass.evaluator import MATHVerifyEvaluator

 math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

@ -24,7 +24,7 @@ math_infer_cfg = dict(


 math_eval_cfg = dict(
-    evaluator=dict(type=MATHEvaluator),
+    evaluator=dict(type=MATHVerifyEvaluator),
 )

 math_datasets = [
--- a/opencompass/configs/datasets/math/math_gen_a58d9d.py
+++ b/opencompass/configs/datasets/math/math_gen_a58d9d.py
@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import MATHDataset
-from opencompass.openicl.icl_evaluator import MATHEvaluator
+from opencompass.evaluator import MATHVerifyEvaluator

 math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

@ -24,7 +24,7 @@ math_infer_cfg = dict(
    inferencer=dict(type=GenInferencer))

 math_eval_cfg = dict(
-    evaluator=dict(type=MATHEvaluator)
+    evaluator=dict(type=MATHVerifyEvaluator)
 )

 math_datasets = [
--- a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py
+++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py
@ -1,7 +1,7 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import MATHEvaluator
+from opencompass.evaluator import MATHVerifyEvaluator
 from opencompass.datasets import (
    MATHDataset,
    math_postprocess_v2,
@ -28,7 +28,7 @@ math_infer_cfg = dict(

 # postprocess v2
 math_eval_cfg = dict(
-    evaluator=dict(type=MATHEvaluator)
+    evaluator=dict(type=MATHVerifyEvaluator)
 )

 math_datasets = [
--- a/opencompass/configs/datasets/mbpp/mbpp_repeat_gen_18dd1b.py
+++ b/opencompass/configs/datasets/mbpp/mbpp_repeat_gen_18dd1b.py
@ -0,0 +1,44 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MBPPDataset, MBPPEvaluator
+
+mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
+
+mbpp_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'),
+                dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n  res = tuple(set(test_tup1) & set(test_tup2))\r\n  return (res)' \n[DONE] \n\n "),
+
+                dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n'),
+                dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n    result = False\r\n    for i in range(2,int(math.sqrt(n)) + 1):\r\n        if n % i == 0:\r\n            result = True\r\n    return result' \n[DONE] \n\n "),
+
+                dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'),
+                dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n  largest_nums = hq.nlargest(n, nums)\r\n  return largest_nums' \n[DONE] \n\n "),
+
+                dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list}  \n'),
+                dict(role='BOT', prompt='[BEGIN]\n'),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
+
+mbpp_datasets = [
+    dict(
+        type=MBPPDataset,
+        abbr='mbpp',
+        path='opencompass/mbpp',
+        reader_cfg=mbpp_reader_cfg,
+        infer_cfg=mbpp_infer_cfg,
+        eval_cfg=mbpp_eval_cfg,
+        n=5,
+        k=3
+    )
+]
--- a/opencompass/configs/datasets/mbpp_pro/README.md
+++ b/opencompass/configs/datasets/mbpp_pro/README.md
@ -0,0 +1,17 @@
+# MBPP pro
+
+## OC results
+
+|           model            |   pass@1 |
+|:--------------------------:|---------:|
+|qwen2.5-coder-7b-instruct-hf|     66   |
+|  qwen2.5-14b-instruct-hf   |     64   |
+|  deepseek-v2-lite-chat-hf  |     36   |
+
+## CodeEval-pro results
+
+|           model            |   pass@1 |
+|:--------------------------:|---------:|
+|qwen2.5-coder-7b-instruct-hf|     65   |
+|  qwen2.5-14b-instruct-hf   |     65   |
+|  deepseek-v2-lite-chat-hf  |     39   |
--- a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
+++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .mbpp_pro_gen_3dc067 import mbpppro_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py
+++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py
@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+mbpppro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+mbpppro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+mbpppro_eval_cfg = dict(
+    evaluator=dict(type=MBPPProEvaluator, 
+                   ip_address='https://opencompass-multiple-evaluator.hf.space'),
+)
+
+mbpppro_datasets = [
+    dict(
+        abbr='mbpp_pro',
+        type=MBPPProDataset,
+        path='opencompass/mbpp_pro',
+        reader_cfg=mbpppro_reader_cfg,
+        infer_cfg=mbpppro_infer_cfg,
+        eval_cfg=mbpppro_eval_cfg)
+]
--- a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py
+++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py
@ -0,0 +1,48 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MBPPProDataset, MBPPProEvaluator
+
+
+PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution.
+```python
+{raw_problem}
+{new_problem}
+```
+Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content:
+```python
+```
+"""
+
+
+mbpppro_reader_cfg = dict(
+    input_columns=['raw_problem', 'new_problem'], output_column='test_code')
+
+mbpppro_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=PROMPT_WRAPPER),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+mbpppro_eval_cfg = dict(
+    evaluator=dict(type=MBPPProEvaluator, 
+                   ip_address='https://opencompass-multiple-evaluator.hf.space'),
+)
+
+mbpppro_datasets = [
+    dict(
+        abbr='mbpp_pro',
+        type=MBPPProDataset,
+        path='opencompass/mbpp_pro',
+        reader_cfg=mbpppro_reader_cfg,
+        infer_cfg=mbpppro_infer_cfg,
+        eval_cfg=mbpppro_eval_cfg,
+        n=5,
+        k=3)
+]
--- a/opencompass/configs/datasets/medmcqa/medmcqa_gen.py
+++ b/opencompass/configs/datasets/medmcqa/medmcqa_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .medmcqa_gen_60c8f5 import medmcqa_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/medmcqa/medmcqa_gen_60c8f5.py
+++ b/opencompass/configs/datasets/medmcqa/medmcqa_gen_60c8f5.py
@ -0,0 +1,58 @@
+from opencompass.datasets import MedmcqaDataset, MedmcqaEvaluator
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+
+SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Used as the SYSTEM turn (fallback_role HUMAN) in the template's `begin` section
+ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'
+
+# Reader configuration
+reader_cfg = dict(
+    input_columns=[
+        'question',
+        'options',
+        'subject_name',
+        'choice_type',
+        'prompt_mode',
+        'topic_name',
+    ],
+    output_column='label',
+)
+
+# Inference configuration
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration
+eval_cfg = dict(
+    evaluator=dict(type=MedmcqaEvaluator),
+    pred_role='BOT',
+)
+medmcqa_dataset = dict(
+    type=MedmcqaDataset,
+    abbr='medmcqa',
+    path='openlifescienceai/medmcqa',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+    
+)
+
+medmcqa_datasets = [medmcqa_dataset]
--- a/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py
+++ b/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .medmcqa_llmjudge_gen_60c8f5 import medmcqa_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen_60c8f5.py
+++ b/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen_60c8f5.py
@ -0,0 +1,105 @@
+from opencompass.datasets import MedmcqaDataset, medmcqa_llmjudge_postprocess
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.evaluator import GenericLLMEvaluator
+
+SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Used as the SYSTEM turn (fallback_role HUMAN) in the template's `begin` section
+ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: Q: {question}\nPlease select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+# Reader configuration
+reader_cfg = dict(
+    input_columns=[
+        'question',
+        'options',
+        'subject_name',
+        'choice_type',
+        'prompt_mode',
+        'topic_name',
+    ],
+    output_column='label',
+)
+
+# Inference configuration
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration
+eval_cfg = dict(
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                ],
+            ),
+        ),
+        dataset_cfg=dict(
+            type=MedmcqaDataset,
+            path='openlifescienceai/medmcqa',
+            prompt_mode='zero-shot',
+            reader_cfg=reader_cfg,
+        ),
+        judge_cfg=dict(),
+        dict_postprocessor=dict(type=medmcqa_llmjudge_postprocess),
+    ),
+)
+medmcqa_dataset = dict(
+    type=MedmcqaDataset,
+    abbr='medmcqa',
+    path='openlifescienceai/medmcqa',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+    
+)
+
+medmcqa_datasets = [medmcqa_dataset]
--- a/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py
+++ b/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py
@ -0,0 +1,127 @@
+"""
+Setting: 0-shot No-CoT
+Evaluator: CascadeEvaluator (rule_evaluator: AccEvaluator, llm_evaluator: GenericLLMEvaluator)
+"""
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import MMLUDataset
+from opencompass.utils.text_postprocessors import match_answer_pattern
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.evaluator import (
+    CascadeEvaluator,
+    GenericLLMEvaluator,
+)
+
+with read_base():
+    # from .....configs.datasets.mmlu.mmlu_all_sets import mmlu_all_sets
+    from .mmlu_stem_sets import mmlu_all_sets
+# None of the MMLU datasets on Hugging Face are parsed correctly, so we use our own dataset reader
+# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. 
+
+{input}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+mmlu_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+mmlu_datasets = []
+for name in mmlu_all_sets:
+    mmlu_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+                ],
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+    
+    mmlu_eval_cfg = dict(
+        evaluator=dict(
+            type=CascadeEvaluator,
+            rule_evaluator=dict(
+                type=AccEvaluator,
+                pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'),
+            ),
+            llm_evaluator = dict(
+                type=GenericLLMEvaluator,
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
+                    ],
+                        round=[
+                        dict(
+                            role='HUMAN',
+                            prompt = GRADER_TEMPLATE
+                        ),
+                    ]),
+                ),
+                dataset_cfg=dict(
+                    abbr=f'lukaemon_mmlu_{name}',
+                    type=MMLUDataset,
+                    path='opencompass/mmlu',
+                    name=name,
+                    reader_cfg=mmlu_reader_cfg,
+                ),
+                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+                judge_cfg=dict(),
+            ),
+            parallel=False
+        ),
+    )
+
+    mmlu_datasets.append(
+        dict(
+            abbr=f'lukaemon_mmlu_{name}',
+            type=MMLUDataset,
+            path='opencompass/mmlu',
+            name=name,
+            reader_cfg=mmlu_reader_cfg,
+            infer_cfg=mmlu_infer_cfg,
+            eval_cfg=mmlu_eval_cfg,
+            mode='singlescore',
+        ))
--- a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_cot_gen_057927.py
+++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_cot_gen_057927.py
@ -0,0 +1,60 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import MMLUProDataset
+from opencompass.utils.text_postprocessors import match_answer_pattern
+
+categories = [
+    'health',
+]
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
+Question:\n
+{question}
+Options:\n
+{options_str}
+""".strip()
+
+mmlu_pro_datasets = []
+
+for category in categories:
+    mmlu_pro_reader_cfg = dict(
+        input_columns=['question', 'cot_content', 'options_str'],
+        output_column='answer',
+        train_split='validation',
+        test_split='test',
+    )
+    mmlu_pro_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(role='HUMAN',
+                         prompt=QUERY_TEMPLATE),
+                ],
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    mmlu_pro_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(
+            type=match_answer_pattern,
+            answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])')
+    )
+
+    mmlu_pro_datasets.append(
+        dict(
+            abbr=f'mmlu_pro_{category.replace(" ", "_")}',
+            type=MMLUProDataset,
+            path='opencompass/mmlu_pro',
+            category=category,
+            reader_cfg=mmlu_pro_reader_cfg,
+            infer_cfg=mmlu_pro_infer_cfg,
+            eval_cfg=mmlu_pro_eval_cfg,
+        ))
--- a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py
+++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py
@ -0,0 +1,101 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import MMLUProDataset, generic_llmjudge_postprocess
+
+categories = [
+    'health',
+]
+
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
+Question:\n
+{question}
+Options:\n
+{options_str}
+""".strip()
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+    <Original Question Begin>: {question}\n {options_str} \n<Original Question End>\n\n
+    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+mmlu_pro_datasets = []  # filled below with one LLM-judged dataset config dict per entry in `categories`
+
+for category in categories:  # `categories` currently holds only 'health'
+    mmlu_pro_reader_cfg = dict(
+        input_columns=['question', 'cot_content', 'options_str'],  # NOTE(review): neither template references {cot_content}; confirm it is still needed
+        output_column='answer',  # GRADER_TEMPLATE reads the gold answer via {answer}
+        train_split='validation',
+        test_split='test',
+    )
+    mmlu_pro_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(role='HUMAN', prompt=QUERY_TEMPLATE),  # NOTE(review): file name says "nocot" but QUERY_TEMPLATE asks to "Think step by step" — confirm
+                ],
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    mmlu_pro_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),  # placeholders: {question}, {options_str}, {answer}, {prediction}
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(  # same type/path/category/reader_cfg as the dataset entry appended below
+                type=MMLUProDataset,
+                path='opencompass/mmlu_pro',
+                category=category,
+                reader_cfg=mmlu_pro_reader_cfg,
+            ),
+            judge_cfg=dict(),  # left empty here; NOTE(review): presumably the judge model is supplied elsewhere — confirm
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+    )
+
+    mmlu_pro_datasets.append(
+        dict(
+            abbr=f'mmlu_pro_{category.replace(" ", "_")}',  # spaces in the category name become underscores
+            type=MMLUProDataset,
+            path='opencompass/mmlu_pro',
+            category=category,
+            reader_cfg=mmlu_pro_reader_cfg,
+            infer_cfg=mmlu_pro_infer_cfg,
+            eval_cfg=mmlu_pro_eval_cfg,
+        )
+    )
--- a/opencompass/configs/datasets/multipl_e/multiple_gen.py
+++ b/opencompass/configs/datasets/multipl_e/multiple_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():  # re-exports `multiple_datasets` from the hash-suffixed sibling config
+    from .multiple_top_ten_gen_f44aaf import multiple_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py
+++ b/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py
@ -32,7 +32,6 @@ multiple_datasets = [
        type=MultiplEDataset,
        abbr=f'humaneval-multiple-{lang}',
        language=lang,
-        num_repeats=1,
        path='opencompass/multipl_e',
        tag='humaneval',
        reader_cfg=multiple_reader_cfg,
@ -46,7 +45,6 @@ multiple_datasets += [
        type=MultiplEDataset,
        abbr=f'mbpp-multiple-{lang}',
        language=lang,
-        num_repeats=1,
        path='opencompass/multipl_e',
        tag='mbpp',
        reader_cfg=multiple_reader_cfg,
--- a/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py
+++ b/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py
@ -0,0 +1,58 @@
+# Meant to select the 10 most popular programming languages from MultiPL-E for the test set; this variant currently lists only 'cpp'.
+
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MultiplEDataset, MultiplEEvaluator
+
+
+_TOP_TEN_LANGUAGE_ = ['cpp']  # NOTE(review): only 'cpp' is listed despite the "top ten" name — confirm this is intended
+
+multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests')  # both input columns are substituted into the prompt template below
+
+multiple_infer_cfg = dict(
+    prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+multiple_eval_cfg = {  # one eval config per language, keyed by the language name
+    lang: dict(
+        evaluator=dict(
+            type=MultiplEEvaluator,
+            language=lang,
+            ip_address='https://opencompass-multiple-evaluator.hf.space',  # NOTE(review): hard-coded remote endpoint — confirm it is reachable from the evaluation environment
+        ),
+        pred_role='BOT',
+    ) for lang in _TOP_TEN_LANGUAGE_
+}
+
+multiple_datasets = [  # HumanEval-tagged entries, one per language
+    dict(
+        type=MultiplEDataset,
+        abbr=f'humaneval-multiple-{lang}',
+        language=lang,
+        path='opencompass/multipl_e',
+        tag='humaneval',
+        reader_cfg=multiple_reader_cfg,
+        infer_cfg=multiple_infer_cfg,
+        eval_cfg=multiple_eval_cfg[lang],
+        n=5,  # NOTE(review): presumably the number of repeated samples per problem — confirm
+        k=3  # NOTE(review): presumably the k used for pass@k-style metrics — confirm
+    ) for lang in _TOP_TEN_LANGUAGE_
+]
+
+multiple_datasets += [  # MBPP-tagged entries appended to the same list, one per language
+    dict(
+        type=MultiplEDataset,
+        abbr=f'mbpp-multiple-{lang}',
+        language=lang,
+        path='opencompass/multipl_e',
+        tag='mbpp',
+        reader_cfg=multiple_reader_cfg,
+        infer_cfg=multiple_infer_cfg,
+        eval_cfg=multiple_eval_cfg[lang],
+        n=5,  # NOTE(review): presumably the number of repeated samples per problem — confirm
+        k=3  # NOTE(review): presumably the k used for pass@k-style metrics — confirm
+    ) for lang in _TOP_TEN_LANGUAGE_
+]
--- a/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py
+++ b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():  # re-exports `nejmaibench_datasets` from the hash-suffixed sibling config
+    from .nejmaibench_gen_60c8f5 import nejmaibench_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen_60c8f5.py
+++ b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen_60c8f5.py
@ -0,0 +1,59 @@
+from opencompass.datasets import NejmaibenchDataset, NejmaibenchEvaluator
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+
+import os
+
+SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Original author note: "Where to put this?" — it is currently used as the SYSTEM entry of infer_cfg's `begin` list
+ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'  # NOTE(review): only {question} is substituted although the text says "options above" — confirm the options are embedded in the question column
+
+# Reader configuration
+reader_cfg = dict(
+    input_columns=[
+        'question',
+        'options',  # not referenced by ZERO_SHOT_PROMPT (see note on that constant)
+        'Subject',
+        'prompt_mode',
+        
+    ],
+    output_column='label',
+)
+
+# Inference configuration
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),  # fallback_role is set to HUMAN for the system prompt
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration
+eval_cfg = dict(
+    evaluator=dict(type=NejmaibenchEvaluator),
+    pred_role='BOT',
+)
+nejmaibench_dataset = dict(  # the single dataset entry built from the three configs above
+    type=NejmaibenchDataset,
+    abbr='nejmaibench',
+    path='opencompass/nejmaibench',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+    
+)
+
+nejmaibench_datasets = [nejmaibench_dataset]  # list form; this is the name nejmaibench_gen.py imports
--- a/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py
+++ b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py
@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():  # re-exports `nejmaibench_datasets` from the hash-suffixed LLM-judge sibling config
+    from .nejmaibench_llmjudge_gen_60c8f5 import nejmaibench_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen_60c8f5.py
+++ b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen_60c8f5.py
@ -0,0 +1,108 @@
+from opencompass.datasets import NejmaibenchDataset
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.evaluator import GenericLLMEvaluator
+import os
+
+SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Original author note: "Where to put this?" — it is currently used as the SYSTEM entry of infer_cfg's `begin` list
+ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n'  # NOTE(review): only {question} is substituted although the text says "options above" — confirm the options are embedded in the question column
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: Q: {question}\nPlease select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+# Reader configuration
+reader_cfg = dict(
+    input_columns=[
+        'question',
+        'options',  # NOTE(review): neither ZERO_SHOT_PROMPT nor GRADER_TEMPLATE substitutes {options} — confirm the options are embedded in the question column
+        'Subject',
+        'prompt_mode',
+        
+    ],
+    output_column='label',  # GRADER_TEMPLATE reads the gold answer via {label}
+)
+
+
+# Inference configuration
+infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT),  # fallback_role is set to HUMAN for the system prompt
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
+                ),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+# Evaluation configuration
+eval_cfg = dict(
+    evaluator=dict(
+        type=GenericLLMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                    )
+                ],
+                round=[
+                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),  # placeholders: {question}, {label}, {prediction}
+                ],
+            ),
+        ),
+        dataset_cfg=dict(  # same type/path/prompt_mode/reader_cfg as nejmaibench_dataset below
+            type=NejmaibenchDataset,
+            path='opencompass/nejmaibench',
+            prompt_mode='zero-shot',
+            reader_cfg=reader_cfg,
+        ),
+        judge_cfg=dict(),  # left empty here; NOTE(review): presumably the judge model is supplied elsewhere — confirm
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+    ),
+)
+
+
+nejmaibench_dataset = dict(  # the single dataset entry built from the three configs above
+    type=NejmaibenchDataset,
+    abbr='nejmaibench',  # NOTE(review): identical abbr to the entry in nejmaibench_gen_60c8f5.py — confirm the two are never loaded together
+    path='opencompass/nejmaibench',
+    prompt_mode='zero-shot',
+    reader_cfg=reader_cfg,
+    infer_cfg=infer_cfg,
+    eval_cfg=eval_cfg,
+    
+)
+
+nejmaibench_datasets = [nejmaibench_dataset]  # list form; this is the name nejmaibench_llmjudge_gen.py imports
--- a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py
+++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py
@ -1,30 +1,46 @@
+"""
+Summary: A config for OmniMath Dataset Evaluation.
+Setting:
+    Shot: 0-shot
+    Evaluator:
+        - CascadeEvaluator
+            - MATHVerifyEvaluator
+            - GenericLLMEvaluator
+    Repeat: 1
+Available Models:
+    - Instruct/Chat Models
+"""
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
-from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets.omni_math import OmniMathDataset
+from opencompass.evaluator import (
+    CascadeEvaluator,
+    GenericLLMEvaluator,
+    MATHVerifyEvaluator,
+)

-aime2024_reader_cfg = dict(
-    input_columns=['question'], 
+omnimath_reader_cfg = dict(
+    input_columns=['problem'], 
    output_column='answer'
 )

-
-aime2024_infer_cfg = dict(
+omnimath_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
-                dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
-            ],
+                dict(role='HUMAN', prompt='please answer the following mathematical question, put your final answer in \\boxed{}.\n\n{problem}'),
+            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=2048)
+    inferencer=dict(type=GenInferencer)
 )


+
 GRADER_TEMPLATE = """
    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
    
@ -43,16 +59,20 @@ GRADER_TEMPLATE = """
    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


-    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
+    <Original Question Begin>: \n{problem}\n<Original Question End>\n\n
    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
    
    Judging the correctness of candidates' answers:
 """.strip()

-aime2024_eval_cfg = dict(
-    evaluator=dict(
-        type=LMEvaluator,
+cascade_evaluator = dict(
+    type=CascadeEvaluator,
+    rule_evaluator=dict(
+        type=MATHVerifyEvaluator,
+    ),
+    llm_evaluator=dict(
+        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
@ -69,19 +89,27 @@ aime2024_eval_cfg = dict(
                ),
            ]),
        ),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        dataset_cfg=dict(
+            type=OmniMathDataset,
+            reader_cfg=omnimath_reader_cfg,
+        ),
+        judge_cfg=dict(),
+        dict_postprocessor=dict(type=generic_llmjudge_postprocess),    
    ),
-    pred_role='BOT',
+    parallel=False,
 )

-aime2024_datasets = [
+omnimath_eval_cfg = dict(
+    evaluator=cascade_evaluator,
+)
+
+omnimath_datasets = [
    dict(
-        abbr='aime2024',
-        type=Aime2024Dataset,
-        path='opencompass/aime2024',
-        reader_cfg=aime2024_reader_cfg,
-        infer_cfg=aime2024_infer_cfg,
-        eval_cfg=aime2024_eval_cfg,
-        mode='singlescore',
+        type=OmniMathDataset,
+        abbr='OmniMath',
+        reader_cfg=omnimath_reader_cfg,
+        infer_cfg=omnimath_infer_cfg,
+        eval_cfg=omnimath_eval_cfg,
+        n=1,
    )
 ]
--- a/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_base.py
+++ b/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_base.py
@ -0,0 +1,14 @@
+import torch
+from opencompass.models import HuggingFaceBaseModel
+
+models = [  # single entry: Baichuan-M1-14B-Base loaded through HuggingFaceBaseModel
+    dict(
+        type=HuggingFaceBaseModel,
+        abbr='baichuan-m1-14b-base-hf',
+        path='baichuan-inc/Baichuan-M1-14B-Base',
+        max_out_len=1024,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True, torch_dtype=torch.bfloat16),  # NOTE(review): trust_remote_code=True presumably runs code shipped with the model repo — confirm it is trusted
+        run_cfg=dict(num_gpus=1),
+    )
+]
--- a/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_instruct.py
+++ b/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_instruct.py
@ -0,0 +1,14 @@
+import torch
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [  # single entry: Baichuan-M1-14B-Instruct loaded through HuggingFacewithChatTemplate
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='baichuan-m1-14b-instruct-hf',
+        path='baichuan-inc/Baichuan-M1-14B-Instruct',
+        max_out_len=2048,  # twice the 1024 used in hf_baichuan_m1_14b_base.py
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True, torch_dtype=torch.bfloat16),  # NOTE(review): trust_remote_code=True presumably runs code shipped with the model repo — confirm it is trusted
+        run_cfg=dict(num_gpus=1),
+    )
+]
--- a/opencompass/configs/models/gemma/vllm_gemma_3_12b_it.py
+++ b/opencompass/configs/models/gemma/vllm_gemma_3_12b_it.py
@ -0,0 +1,16 @@
+from opencompass.models import VLLMwithChatTemplate
+
+models = [  # single entry: google/gemma-3-12b-it served through VLLMwithChatTemplate
+    dict(
+        type=VLLMwithChatTemplate,
+        abbr='gemma-3-12b-it-vllm',
+        path='google/gemma-3-12b-it',
+        model_kwargs=dict(tensor_parallel_size=4,  # same value as run_cfg num_gpus below
+                          # for long context
+                          rope_scaling={'factor': 8.0, 'rope_type': 'linear'}),
+        max_out_len=4096,  # NOTE(review): no max_seq_len is set here, unlike vllm_gemma_3_4b_it.py (140000) — confirm the default is intended
+        batch_size=1,
+        generation_kwargs=dict(temperature=0),  # NOTE(review): presumably greedy decoding — confirm
+        run_cfg=dict(num_gpus=4),
+    )
+]
--- a/opencompass/configs/models/gemma/vllm_gemma_3_27b_it.py
+++ b/opencompass/configs/models/gemma/vllm_gemma_3_27b_it.py
@ -0,0 +1,16 @@
+from opencompass.models import VLLMwithChatTemplate
+
+models = [  # single entry: google/gemma-3-27b-it served through VLLMwithChatTemplate
+    dict(
+        type=VLLMwithChatTemplate,
+        abbr='gemma-3-27b-it-vllm',
+        path='google/gemma-3-27b-it',
+        model_kwargs=dict(tensor_parallel_size=4,  # same value as run_cfg num_gpus below
+                          # for long context
+                          rope_scaling={'factor': 8.0, 'rope_type': 'linear'}),
+        max_out_len=4096,  # NOTE(review): no max_seq_len is set here, unlike vllm_gemma_3_4b_it.py (140000) — confirm the default is intended
+        batch_size=1,
+        generation_kwargs=dict(temperature=0),  # NOTE(review): presumably greedy decoding — confirm
+        run_cfg=dict(num_gpus=4),
+    )   
+]
--- a/opencompass/configs/models/gemma/vllm_gemma_3_4b_it.py
+++ b/opencompass/configs/models/gemma/vllm_gemma_3_4b_it.py
@ -0,0 +1,17 @@
+from opencompass.models import VLLMwithChatTemplate
+
+models = [  # single entry: google/gemma-3-4b-it served through VLLMwithChatTemplate
+    dict(
+        type=VLLMwithChatTemplate,
+        abbr='gemma-3-4b-it-vllm',
+        path='google/gemma-3-4b-it',
+        model_kwargs=dict(tensor_parallel_size=2,  # same value as run_cfg num_gpus below
+                          # for long context
+                          rope_scaling={'factor': 8.0, 'rope_type': 'linear'}),
+        max_seq_len=140000,  # explicit sequence limit; the 12b and 27b Gemma configs do not set one
+        max_out_len=4096,
+        batch_size=1,
+        generation_kwargs=dict(temperature=0),  # NOTE(review): presumably greedy decoding — confirm
+        run_cfg=dict(num_gpus=2),
+    )
+]
--- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm3_8b_instruct_128k.py
+++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm3_8b_instruct_128k.py
@ -0,0 +1,19 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [  # single entry: internlm/internlm3-8b-instruct through TurboMindModelwithChatTemplate with a long session length
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='internlm3-8b-instruct-turbomind',  # NOTE(review): abbr omits the "128k" from the file name — confirm it is unique among model configs
+        path='internlm/internlm3-8b-instruct',
+        engine_config=dict(session_len=142000, max_batch_size=1, tp=2,  # session_len equals max_seq_len below; tp equals run_cfg num_gpus
+                           # for long context
+                           rope_scaling_factor=6.0),
+        gen_config=dict(
+            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192  # max_new_tokens equals max_out_len below
+        ),
+        max_seq_len=142000,
+        max_out_len=8192,
+        batch_size=1,  # equals engine_config max_batch_size
+        run_cfg=dict(num_gpus=2),
+    )
+]
--- a/opencompass/configs/models/hf_internlm/lmdeploy_oreal_32b.py
+++ b/opencompass/configs/models/hf_internlm/lmdeploy_oreal_32b.py
@ -0,0 +1,20 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+
+models = [  # single entry: internlm/OREAL-32B through TurboMindModelwithChatTemplate
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='OREAL-32B',
+        path='internlm/OREAL-32B',
+        engine_config=dict(session_len=32768, max_batch_size=16, tp=4),  # tp equals run_cfg num_gpus; max_batch_size equals batch_size
+        gen_config=dict(top_k=1,
+                        temperature=1e-6,
+                        top_p=0.9,
+                        max_new_tokens=32768),  # NOTE(review): equals session_len, which appears to leave no headroom for the prompt — confirm
+        max_seq_len=32768,
+        max_out_len=32768,
+        batch_size=16,
+        run_cfg=dict(num_gpus=4),
+        pred_postprocessor=dict(type=extract_non_reasoning_content)  # NOTE(review): presumably strips the reasoning section from predictions before scoring — confirm
+    )
+]
--- a/opencompass/configs/models/huatuogpt/hf_huatuogpt2_13b.py
+++ b/opencompass/configs/models/huatuogpt/hf_huatuogpt2_13b.py
@ -0,0 +1,17 @@
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [  # single entry: FreedomIntelligence/HuatuoGPT2-13B through HuggingFacewithChatTemplate
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='huatuogpt2-13b-hf',
+        path='FreedomIntelligence/HuatuoGPT2-13B',
+        tokenizer_kwargs=dict(padding_side='left',  # explicit tokenizer settings; hf_huatuogpt2_7b.py passes none
+                              truncation_side='left',
+                              trust_remote_code=True,
+                              use_fast=True,),
+        max_out_len=1024,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),  # NOTE(review): trust_remote_code=True presumably runs code shipped with the model repo — confirm it is trusted
+        run_cfg=dict(num_gpus=4),
+    )
+]
--- a/opencompass/configs/models/huatuogpt/hf_huatuogpt2_7b.py
+++ b/opencompass/configs/models/huatuogpt/hf_huatuogpt2_7b.py
@ -0,0 +1,13 @@
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [  # single entry: FreedomIntelligence/HuatuoGPT2-7B through HuggingFacewithChatTemplate
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='huatuogpt2-7b-hf',
+        path='FreedomIntelligence/HuatuoGPT2-7B',
+        max_out_len=1024,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),  # NOTE(review): trust_remote_code=True presumably runs code shipped with the model repo — confirm it is trusted
+        run_cfg=dict(num_gpus=1),
+    )
+]
--- a/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_7b.py
+++ b/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_7b.py
@ -0,0 +1,15 @@
+from opencompass.models import HuggingFacewithChatTemplate
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+
+models = [  # single entry: FreedomIntelligence/HuatuoGPT-o1-7B through HuggingFacewithChatTemplate
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='huatuogpt-o1-7b-hf',
+        path='FreedomIntelligence/HuatuoGPT-o1-7B',
+        max_out_len=2048,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),  # NOTE(review): trust_remote_code=True presumably runs code shipped with the model repo — confirm it is trusted
+        run_cfg=dict(num_gpus=1),
+        pred_postprocessor=dict(type=extract_non_reasoning_content, think_start_token='## Thinking', think_end_token='## Final Response'),  # passes '## Thinking' / '## Final Response' as the reasoning delimiters
+    )
+]
--- a/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_8b.py
+++ b/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_8b.py
@ -0,0 +1,15 @@
+from opencompass.models import HuggingFacewithChatTemplate
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+
+models = [  # single entry: FreedomIntelligence/HuatuoGPT-o1-8B through HuggingFacewithChatTemplate
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='huatuogpt-o1-8b-hf',
+        path='FreedomIntelligence/HuatuoGPT-o1-8B',
+        max_out_len=2048,
+        batch_size=8,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),  # NOTE(review): trust_remote_code=True presumably runs code shipped with the model repo — confirm it is trusted
+        run_cfg=dict(num_gpus=1),
+        pred_postprocessor=dict(type=extract_non_reasoning_content, think_start_token='## Thinking', think_end_token='## Final Response'),  # passes '## Thinking' / '## Final Response' as the reasoning delimiters
+    )
+]
--- a/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_38b.py
+++ b/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_38b.py
@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [  # single entry: OpenGVLab/InternVL2_5-38B through TurboMindModelwithChatTemplate
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='internvl2_5-38b-turbomind',
+        path='OpenGVLab/InternVL2_5-38B',
+        engine_config=dict(session_len=8192, max_batch_size=8, tp=4),  # tp equals run_cfg num_gpus; max_batch_size equals batch_size
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),  # NOTE(review): max_new_tokens=1024 differs from max_out_len=8192 below (the 8b config uses 8192 for both) — confirm which limit is intended
+        max_seq_len=8192,
+        max_out_len=8192,  # NOTE(review): equals max_seq_len/session_len, which appears to leave no headroom for the prompt — confirm
+        batch_size=8,
+        run_cfg=dict(num_gpus=4),
+    )
+]
--- a/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_8b.py
+++ b/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_8b.py
@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [  # single entry: OpenGVLab/InternVL2_5-8B through TurboMindModelwithChatTemplate
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='internvl2_5-8b-turbomind',
+        path='OpenGVLab/InternVL2_5-8B',
+        engine_config=dict(session_len=8192, max_batch_size=16, tp=1),  # tp equals run_cfg num_gpus; max_batch_size equals batch_size
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192),  # max_new_tokens equals max_out_len below
+        max_seq_len=8192,
+        max_out_len=8192,  # NOTE(review): equals max_seq_len/session_len, which appears to leave no headroom for the prompt — confirm
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    )
+]
--- a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct_128k.py
+++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct_128k.py
@ -0,0 +1,21 @@
+from opencompass.models import VLLMwithChatTemplate
+
+models = [  # single entry: Qwen/Qwen2.5-14B-Instruct through VLLMwithChatTemplate with YaRN rope scaling
+    dict(
+        type=VLLMwithChatTemplate,
+        abbr='qwen2.5-14b-instruct-vllm',  # NOTE(review): abbr omits the "128k" from the file name — confirm it is unique among model configs
+        path='Qwen/Qwen2.5-14B-Instruct',
+        model_kwargs=dict(
+            tensor_parallel_size=4,  # same value as run_cfg num_gpus below
+            rope_scaling={
+                'factor': 4.0,  # 4.0 x 32768 = 131072, matching the "128k" in the file name
+                'original_max_position_embeddings': 32768,
+                'rope_type': 'yarn'
+            },
+        ),
+        max_out_len=4096,
+        batch_size=1,
+        generation_kwargs=dict(temperature=0),  # NOTE(review): presumably greedy decoding — confirm
+        run_cfg=dict(num_gpus=4),
+    )
+]
--- a/Show More
+++ b/Show More