diff --git a/.github/scripts/eval_regression_api.py b/.github/scripts/eval_regression_api.py index ba1902a9..98f0fdf0 100644 --- a/.github/scripts/eval_regression_api.py +++ b/.github/scripts/eval_regression_api.py @@ -24,9 +24,9 @@ models = [ abbr='lmdeploy-api-test', type=OpenAISDK, key='EMPTY', - openai_api_base='http://0.0.0.0:23333/v1', - path='internlm2', - tokenizer_path='internlm/internlm2_5-7b-chat', + openai_api_base='http://localhost:23333/v1', + path='internlm3', + tokenizer_path='internlm/internlm3-8b-instruct', rpm_verbose=True, meta_template=api_meta_template, query_per_second=128, diff --git a/.github/scripts/eval_regression_base_models.py b/.github/scripts/eval_regression_base_models.py index a8dc7a60..4259cc36 100644 --- a/.github/scripts/eval_regression_base_models.py +++ b/.github/scripts/eval_regression_base_models.py @@ -11,18 +11,10 @@ with read_base(): from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ winogrande_datasets # noqa: F401, E501 # read hf models - chat models - from opencompass.configs.models.chatglm.hf_glm4_9b import \ - models as hf_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \ models as lmdeploy_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \ models as hf_deepseek_7b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \ - models as hf_deepseek_67b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \ - models as hf_deepseek_moe_16b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ - models as hf_deepseek_v2_lite_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \ @@ -49,12 +41,6 @@ with read_base(): models as hf_internlm2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \ models as hf_internlm2_7b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \ - models as hf_internlm2_20b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \ - models as hf_internlm2_base_7b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \ - models as hf_internlm2_base_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \ models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ @@ -65,14 +51,14 @@ with read_base(): models as lmdeploy_internlm2_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \ models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \ + models as lmdeploy_internlm2_base_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama2_7b import \ models as hf_llama2_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \ models as hf_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b import \ models as hf_llama3_8b_model # noqa: F401, E501 - from 
opencompass.configs.models.hf_llama.hf_llama3_70b import \ - models as hf_llama3_70b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \ models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py index 40ec1bc5..bfe923f6 100644 --- a/.github/scripts/eval_regression_chat_models.py +++ b/.github/scripts/eval_regression_chat_models.py @@ -15,14 +15,24 @@ with read_base(): models as vllm_glm4_9b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \ models as hf_deepseek_7b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \ - models as hf_deepseek_67b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \ - models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \ - models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \ + models as lmdeploy_deepseek_67b_chat_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \ + models as \ + lmdeploy_deepseek_r1_distill_llama_8b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \ + models as \ + lmdeploy_deepseek_r1_distill_llama_70b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \ + models as \ + lmdeploy_deepseek_r1_distill_qwen_1_5b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \ + models as \ + lmdeploy_deepseek_r1_distill_qwen_32b_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \ models as lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \ + models as lmdeploy_deepseek_v2_lite_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \ models as vllm_deepseek_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma2_2b_it import \ @@ -45,6 +55,8 @@ with read_base(): models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \ models as hf_internlm2_5_20b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \ + models as hf_internlm3_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \ @@ -57,6 +69,8 @@ with read_base(): models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \ models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \ + models as lmdeploy_internlm3_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \ 
models as vllm_internlm2_chat_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \ @@ -83,10 +97,6 @@ with read_base(): models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \ models as hf_mistral_small_instruct_2409_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ - models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \ - models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \ models as \ lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501 @@ -95,14 +105,19 @@ with read_base(): from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \ models as \ lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501 + from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \ + models as \ + lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \ models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \ + models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \ models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501 - from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \ - models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501 + from opencompass.configs.models.phi.hf_phi_4 import \ + models as hf_phi_4_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \ models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \ @@ -142,6 +157,8 @@ with read_base(): from ...volc import infer as volc_infer # noqa: F401, E501 +hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf' + race_datasets = [race_datasets[1]] datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 4ef414dc..1cbc5ad2 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -175,10 +175,11 @@ class TestApibench: class TestVolcFullbench: """Test cases for chat model.""" - @pytest.mark.parametrize( - 'model, dataset', - [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind'] - for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')]) + @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ + 'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind', + 'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch', + 'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch' + ] for p2 in dataset_list(p1, 'objective')]) @pytest.mark.chat_objective def test_chat_objective(self, baseline_scores_fullbench, result_scores, model, dataset): @@ -245,10 +246,7 @@ class TestCmdCase: @pytest.mark.parametrize('model, dataset', [('internlm2_5-7b-hf', 'race-middle_accuracy'), 
('internlm2_5-7b-hf', 'race-high_accuracy'), - ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'), - ('internlm2-1.8b-hf', 'race-middle_accuracy'), - ('internlm2-1.8b-hf', 'race-high_accuracy'), - ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')]) + ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')]) def test_cmd_case1(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -260,9 +258,9 @@ class TestCmdCase: [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'), ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'), ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')]) + ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')]) def test_cmd_case2(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -280,13 +278,25 @@ class TestCmdCase: @pytest.mark.case4 @pytest.mark.parametrize( - 'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'), - ('internlm2_5-7b-chat_hf', 'race-high_accuracy'), - ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')]) + 'model, dataset', + [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')]) def test_cmd_case4(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score, dataset) + assert_score(model + '_batch', result_score, base_score, dataset) + + @pytest.mark.case5 + @pytest.mark.parametrize( + 'model, dataset', + [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'), + ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'), + ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')]) + def test_cmd_case5(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score, dataset) def assert_score(model_type, score, baseline, dataset: str = ''): diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index cd2e3328..e4567553 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -8,20 +8,25 @@ internlm2_5-7b_hf: race-middle_accuracy: 91.78 race-high_accuracy: 90.02 -internlm2-1.8b-hf: - demo_gsm8k_accuracy: 15.62 - race-middle_accuracy: 71.66 - race-high_accuracy: 66.38 - internlm2_5-7b-chat-lmdeploy: - demo_gsm8k_accuracy: 89.06 + demo_gsm8k_accuracy: 87.50 race-middle_accuracy: 92.76 race-high_accuracy: 90.54 -internlm2-chat-1.8b-lmdeploy: - demo_gsm8k_accuracy: 31 - race-middle_accuracy: 81.34 - race-high_accuracy: 73.96 +internlm3-8b-instruct-lmdeploy: + demo_gsm8k_accuracy: 73.44 + race-middle_accuracy: 93.38 + race-high_accuracy: 90.34 + +internlm3-8b-instruct_hf-lmdeploy: + demo_gsm8k_accuracy: 73.44 + race-middle_accuracy: 93.38 + race-high_accuracy: 90.34 + +internlm3-8b-instruct_hf-vllm: + demo_gsm8k_accuracy: 
81.25 + race-middle_accuracy: 92.20 + race-high_accuracy: 89.88 internlm2_5-7b-chat_hf: demo_gsm8k_accuracy: 87.50 @@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf: race-high_accuracy: 90.48 lmdeploy-api-test: - gsm8k_accuracy: 68.75 - race-middle_accuracy: 87.50 + gsm8k_accuracy: 56.25 + race-middle_accuracy: 93.75 race-high_accuracy: 93.75 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 9f171a02..3f5753d3 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench: lcb_test_output_pass@1: 18.75 bbh-logical_deduction_seven_objects_score: 50 bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_naive_average: 72.6 - cmmlu-china-specific_naive_average: 76.25 + mmlu-other_accuracy: 72.6 + cmmlu-china-specific_accuracy: 76.25 mmlu_pro_math_accuracy: 25 ds1000_Pandas_accuracy: 12.5 ds1000_Numpy_accuracy: 0 @@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench: college_knowledge_naive_average: 87.5 subjective: alignment_bench_v1_1_总分: 0.66 - alpaca_eval_total: 20 + alpaca_eval_total: 0 arenahard_score: 50 Followbench_naive_average: 1 CompassArena_naive_average: 43 mtbench101_avg: 7.8 - wildbench_average: -12.78 + wildbench_average: -15.56 simpleqa_accuracy_given_attempted: 0 chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 7.90 + alignment_bench_v1_1_专业能力: 8.00 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -55,7 +55,7 @@ internlm2_5-7b-chat-hf_fullbench: alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 - alpaca_eval_helpful_base: 20 + alpaca_eval_helpful_base: 0 compassarena_language_naive_average: 35 compassarena_knowledge_naive_average: 55 compassarena_reason_v2_naive_average: 40 @@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench: internlm2_5-7b-chat-turbomind_fullbench: objective: race-high_accuracy: 93.75 - ARC-c_accuracy: 93.75 + ARC-c_accuracy: 87.50 BoolQ_accuracy: 68.75 triviaqa_wiki_1shot_score: 50 nq_open_1shot_score: 25 IFEval_Prompt-level-strict-accuracy: 56.25 - drop_accuracy: 81.25 + drop_accuracy: 75 GPQA_diamond_accuracy: 31.25 - hellaswag_accuracy: 81.25 - TheoremQA_score: 6.25 + hellaswag_accuracy: 87.5 + TheoremQA_score: 12.5 musr_average_naive_average: 39.58 - korbench_single_naive_average: 37.50 - gsm8k_accuracy: 68.75 - math_accuracy: 68.75 + korbench_single_naive_average: 40 + gsm8k_accuracy: 62.5 + math_accuracy: 75 cmo_fib_accuracy: 6.25 aime2024_accuracy: 6.25 - wikibench-wiki-single_choice_cncircular_perf_4: 50.00 + wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 68.75 - ds1000_naive_average: 16.96 + ds1000_naive_average: 17.86 lcb_code_generation_pass@1: 12.5 lcb_code_execution_pass@1: 43.75 - lcb_test_output_pass@1: 25.00 - bbh-logical_deduction_seven_objects_score: 50.00 - bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_naive_average: 69.71 - cmmlu-china-specific_naive_average: 75.83 + lcb_test_output_pass@1: 18.75 + bbh-logical_deduction_seven_objects_score: 56.25 + bbh-multistep_arithmetic_two_score: 75 + mmlu-other_accuracy: 72.6 + cmmlu-china-specific_accuracy: 78.33 mmlu_pro_math_accuracy: 31.25 - ds1000_Pandas_accuracy: 0 + ds1000_Pandas_accuracy: 12.5 ds1000_Numpy_accuracy: 0 ds1000_Tensorflow_accuracy: 12.5 - ds1000_Scipy_accuracy: 18.75 + ds1000_Scipy_accuracy: 25 ds1000_Sklearn_accuracy: 18.75 - ds1000_Pytorch_accuracy: 18.75 + 
ds1000_Pytorch_accuracy: 6.25 ds1000_Matplotlib_accuracy: 50.00 openai_mmmlu_lite_AR-XY_accuracy: 37.5 college_naive_average: 12.50 college_knowledge_naive_average: 87.5 subjective: - alignment_bench_v1_1_总分: 0.70 + alignment_bench_v1_1_总分: 0.66 alpaca_eval_total: 0 arenahard_score: 50 Followbench_naive_average: 1 - CompassArena_naive_average: 38 - mtbench101_avg: 7.80 - wildbench_average: -4.86 + CompassArena_naive_average: 40 + mtbench101_avg: 8 + wildbench_average: -6.81 simpleqa_accuracy_given_attempted: 0 chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 8.4 + alignment_bench_v1_1_专业能力: 7.9 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench: alignment_bench_v1_1_综合问答: 0 alpaca_eval_helpful_base: 0 compassarena_language_naive_average: 35 - compassarena_knowledge_naive_average: 50 - compassarena_reason_v2_naive_average: 30 - compassarena_math_v2_naive_average: 50 - compassarena_creationv2_zh_naive_average: 25 + compassarena_knowledge_naive_average: 45 + compassarena_reason_v2_naive_average: 25 + compassarena_math_v2_naive_average: 60 + compassarena_creationv2_zh_naive_average: 35 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 @@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench: drop_accuracy: 62.5 GPQA_diamond_accuracy: 62.5 hellaswag_accuracy: 93.75 - TheoremQA_score: 25.00 + TheoremQA_score: 31.25 winogrande_accuracy: 87.5 - gsm8k_accuracy: 62.50 - GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25 + gsm8k_accuracy: 56.25 + GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 math_accuracy: 18.75 wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 62.50 - dingo_en_192_score: 31.25 + dingo_en_192_score: 50.00 dingo_zh_170_score: 93.75 mmlu-other_accuracy: 76.92 cmmlu-china-specific_accuracy: 84.17 mmlu_pro_math_accuracy: 18.75 - bbh-logical_deduction_seven_objects_score: 50 + bbh-logical_deduction_seven_objects_score: 43.75 bbh-multistep_arithmetic_two_score: 56.25 college_naive_average: 12.5 college_knowledge_naive_average: 87.5 @@ -234,15 +234,15 @@ internlm2_5-7b-turbomind: sanitized_mbpp_score: 55.25 dingo_en_192_score: 60.94 dingo_zh_170_score: 67.65 - mmlu-stem_naive_average: 63.72 - mmlu-social-science_naive_average: 80.15 - mmlu-humanities_naive_average: 74.27 - mmlu-other_naive_average: 71.85 - cmmlu-stem_naive_average: 67.07 - cmmlu-social-science_naive_average: 81.49 - cmmlu-humanities_naive_average: 85.84 - cmmlu-other_naive_average: 82.69 - cmmlu-china-specific_naive_average: 79.88 + mmlu-stem_accuracy: 63.72 + mmlu-social-science_accuracy: 80.15 + mmlu-humanities_accuracy: 74.27 + mmlu-other_accuracy: 71.85 + cmmlu-stem_accuracy: 67.07 + cmmlu-social-science_accuracy: 81.49 + cmmlu-humanities_accuracy: 85.84 + cmmlu-other_accuracy: 82.69 + cmmlu-china-specific_accuracy: 79.88 mmlu_pro_biology_accuracy: 58.58 mmlu_pro_business_accuracy: 28.01 mmlu_pro_chemistry_accuracy: 22.79 @@ -281,12 +281,12 @@ internlm2_5-7b-turbomind: longbench_naive_average: 46.19 longbench_zh_naive_average: 49.3 longbench_en_naive_average: 43.97 - longbench_single-document-qa_naive_average: 42.84 - longbench_multi-document-qa_naive_average: 37.29 - longbench_summarization_naive_average: 23.21 - longbench_few-shot-learning_naive_average: 61.67 - longbench_synthetic-tasks_naive_average: 60.05 - longbench_code-completion_naive_average: 52.09 + 
longbench_single-document-qa_score: 42.84 + longbench_multi-document-qa_score: 41.25 + longbench_summarization_score: 23.21 + longbench_few-shot-learning_score: 61.67 + longbench_synthetic-tasks_score: 60.05 + longbench_code-completion_score: 52.09 internlm2_5-7b-chat-turbomind: objective: @@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind: teval_naive_average: 80 SciCode_sub_accuracy: 5.56 qa_dingo_cn_score: 99.01 - mmlu-stem_naive_average: 68.2 - mmlu-social-science_naive_average: 75.8 - mmlu-humanities_naive_average: 69.3 - mmlu-other_naive_average: 71.3 - cmmlu-stem_naive_average: 66.64 - cmmlu-social-science_naive_average: 76 - cmmlu-humanities_naive_average: 77.9 - cmmlu-other_naive_average: 77.25 - cmmlu-china-specific_naive_average: 73.6 + mmlu-stem_accuracy: 68.2 + mmlu-social-science_accuracy: 75.8 + mmlu-humanities_accuracy: 69.3 + mmlu-other_accuracy: 71.3 + cmmlu-stem_accuracy: 66.64 + cmmlu-social-science_accuracy: 76 + cmmlu-humanities_accuracy: 77.9 + cmmlu-other_accuracy: 77.25 + cmmlu-china-specific_accuracy: 73.6 mmlu_pro_biology_accuracy: 66.67 mmlu_pro_business_accuracy: 47.91 mmlu_pro_chemistry_accuracy: 35 @@ -409,7 +409,7 @@ internlm2_5-7b-chat-turbomind: alpaca_eval_koala: 28.21 alpaca_eval_oasst: 23.4 alpaca_eval_selfinstruct: 30.95 - alpaca_eval_vicuna: 25 + alpaca_eval_vicuna: 33.75 compassarena_language_naive_average: 52.5 compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 @@ -448,9 +448,536 @@ internlm2_5-7b-chat-1m-turbomind: babilong_32k_naive_average: 48.9 babilong_128k_naive_average: 40.8 babilong_256k_naive_average: 23.5 - longbench_single-document-qa_naive_average: 43.56 - longbench_multi-document-qa_naive_average: 46.24 - longbench_summarization_naive_average: 24.32 - longbench_few-shot-learning_naive_average: 51.67 - longbench_synthetic-tasks_naive_average: 66.83 - longbench_code-completion_naive_average: 45.99 + longbench_single-document-qa_score: 43.56 + longbench_multi-document-qa_score: 46.24 + longbench_summarization_score: 24.32 + longbench_few-shot-learning_score: 51.67 + longbench_synthetic-tasks_score: 66.83 + longbench_code-completion_score: 45.99 + + +qwen2.5-7b-instruct-turbomind: + objective: + race-high_accuracy: 84.99 + ARC-c_accuracy: 92.2 + BoolQ_accuracy: 86.7 + triviaqa_wiki_1shot_score: 53.06 + nq_open_1shot_score: 17.51 + mmmlu_lite_naive_average: 54.96 + IFEval_Prompt-level-strict-accuracy: 71.53 + drop_accuracy: 80.07 + bbh_naive_average: 68.81 + GPQA_diamond_accuracy: 34.34 + hellaswag_accuracy: 85.42 + TheoremQA_score: 18.38 + musr_average_naive_average: 43.44 + korbench_single_naive_average: 39.44 + ARC_Prize_Public_Evaluation_accuracy: 0 + gsm8k_accuracy: 92.57 + GaokaoBench_weighted_average: 80.14 + math_accuracy: 73.58 + cmo_fib_accuracy: 25 + aime2024_accuracy: 16.67 + Mathbench_naive_average: 77.33 + wikibench-wiki-single_choice_cncircular_perf_4: 34.9 + cmmlu_naive_average: 75.97 + mmlu_naive_average: 76.01 + mmlu_pro_naive_average: 56.12 + openai_humaneval_humaneval_pass@1: 83.54 + sanitized_mbpp_score: 74.71 + humanevalx_naive_average: 48.29 + ds1000_naive_average: 18.66 + lcb_code_generation_pass@1: 39.5 + lcb_code_execution_pass@1: 42.38 + lcb_test_output_pass@1: 50.68 + bigcodebench_hard_instruct_pass@1: 16.22 + bigcodebench_hard_complete_pass@1: 11.49 + teval_naive_average: 79.72 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 99.01 + mmlu_accuracy: 76.01 + mmlu-stem_accuracy: 77.59 + mmlu-social-science_accuracy: 79.02 + mmlu-humanities_accuracy: 72.07 + mmlu-other_accuracy: 74.86 
+ cmmlu_accuracy: 75.97 + cmmlu-stem_accuracy: 73.09 + cmmlu-social-science_accuracy: 75.95 + cmmlu-humanities_accuracy: 76.53 + cmmlu-other_accuracy: 78.79 + cmmlu-china-specific_accuracy: 73.17 + mmlu_pro_accuracy: 56.12 + mmlu_pro_biology_accuracy: 71.41 + mmlu_pro_business_accuracy: 67.68 + mmlu_pro_chemistry_accuracy: 54.59 + mmlu_pro_computer_science_accuracy: 58.29 + mmlu_pro_economics_accuracy: 66.82 + mmlu_pro_engineering_accuracy: 42.41 + mmlu_pro_health_accuracy: 55.87 + mmlu_pro_history_accuracy: 46.46 + mmlu_pro_law_accuracy: 28.97 + mmlu_pro_math_accuracy: 73.13 + mmlu_pro_philosophy_accuracy: 44.89 + mmlu_pro_physics_accuracy: 58.43 + mmlu_pro_psychology_accuracy: 63.16 + mmlu_pro_other_accuracy: 53.57 + humanevalx-python_pass@1: 50 + humanevalx-cpp_pass@1: 42.07 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 74.39 + humanevalx-js_pass@1: 75 + ds1000_Pandas_accuracy: 14.09 + ds1000_Numpy_accuracy: 8.18 + ds1000_Tensorflow_accuracy: 17.78 + ds1000_Scipy_accuracy: 15.09 + ds1000_Sklearn_accuracy: 10.43 + ds1000_Pytorch_accuracy: 4.41 + ds1000_Matplotlib_accuracy: 60.65 + mmmlu_lite_accuracy: 54.96 + openai_mmmlu_lite_AR-XY_accuracy: 42.32 + openai_mmmlu_lite_BN-BD_accuracy: 42.25 + openai_mmmlu_lite_DE-DE_accuracy: 59.93 + openai_mmmlu_lite_ES-LA_accuracy: 66.53 + openai_mmmlu_lite_FR-FR_accuracy: 66.88 + openai_mmmlu_lite_HI-IN_accuracy: 49.26 + openai_mmmlu_lite_ID-ID_accuracy: 61.26 + openai_mmmlu_lite_IT-IT_accuracy: 65.47 + openai_mmmlu_lite_JA-JP_accuracy: 61.54 + openai_mmmlu_lite_KO-KR_accuracy: 60.28 + openai_mmmlu_lite_PT-BR_accuracy: 55.51 + openai_mmmlu_lite_SW-KE_accuracy: 36.42 + openai_mmmlu_lite_YO-NG_accuracy: 32.14 + openai_mmmlu_lite_ZH-CN_accuracy: 69.61 + college_naive_average: 48 + high_naive_average: 59 + middle_naive_average: 78 + primary_naive_average: 85.67 + arithmetic_naive_average: 75.67 + mathbench-a (average)_naive_average: 69.27 + college_knowledge_naive_average: 83.86 + high_knowledge_naive_average: 80.29 + middle_knowledge_naive_average: 84.26 + primary_knowledge_naive_average: 93.16 + mathbench-t (average)_naive_average: 85.39 + + + + +internlm2_5-7b-chat-pytorch: + objective: + race-high_accuracy: 86.39 + ARC-c_accuracy: 90.51 + BoolQ_accuracy: 88.01 + triviaqa_wiki_1shot_score: 64.77 + nq_open_1shot_score: 22.71 + mmmlu_lite_naive_average: 45.02 + IFEval_Prompt-level-strict-accuracy: 56.56 + drop_accuracy: 75.46 + bbh_naive_average: 73.34 + GPQA_diamond_accuracy: 32.83 + hellaswag_accuracy: 94.81 + TheoremQA_score: 23.88 + musr_average_naive_average: 51.31 + korbench_single_naive_average: 32 + ARC_Prize_Public_Evaluation_accuracy: 0.01 + gsm8k_accuracy: 86.96 + GaokaoBench_weighted_average: 78.05 + math_accuracy: 60.34 + cmo_fib_accuracy: 12.98 + aime2024_accuracy: 3.33 + Mathbench_naive_average: 64.82 + wikibench-wiki-single_choice_cncircular_perf_4: 31.7 + cmmlu_naive_average: 74.24 + mmlu_naive_average: 70.2 + mmlu_pro_naive_average: 45.39 + openai_humaneval_humaneval_pass@1: 70.12 + sanitized_mbpp_score: 64.59 + humanevalx_naive_average: 38.78 + ds1000_naive_average: 14.19 + lcb_code_generation_pass@1: 16.5 + lcb_code_execution_pass@1: 33.82 + lcb_test_output_pass@1: 22.62 + bigcodebench_hard_instruct_pass@1: 6.08 + bigcodebench_hard_complete_pass@1: 6.76 + teval_naive_average: 79.73 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 70.2 + mmlu-stem_accuracy: 67.73 + mmlu-social-science_accuracy: 75.49 + mmlu-humanities_accuracy: 68.56 + mmlu-other_accuracy: 70.58 + cmmlu_accuracy: 74.24 + cmmlu-stem_accuracy: 
66.7 + cmmlu-social-science_accuracy: 75.88 + cmmlu-humanities_accuracy: 77.56 + cmmlu-other_accuracy: 77.52 + cmmlu-china-specific_accuracy: 73.46 + mmlu_pro_accuracy: 45.39 + mmlu_pro_biology_accuracy: 65.83 + mmlu_pro_business_accuracy: 51.96 + mmlu_pro_chemistry_accuracy: 36.84 + mmlu_pro_computer_science_accuracy: 48.29 + mmlu_pro_economics_accuracy: 56.16 + mmlu_pro_engineering_accuracy: 29.1 + mmlu_pro_health_accuracy: 44.5 + mmlu_pro_history_accuracy: 42.26 + mmlu_pro_law_accuracy: 24.98 + mmlu_pro_math_accuracy: 54.85 + mmlu_pro_philosophy_accuracy: 39.28 + mmlu_pro_physics_accuracy: 37.41 + mmlu_pro_psychology_accuracy: 58.27 + mmlu_pro_other_accuracy: 45.78 + humanevalx-python_pass@1: 56.1 + humanevalx-cpp_pass@1: 20.73 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 59.15 + humanevalx-js_pass@1: 57.93 + ds1000_Pandas_accuracy: 8.93 + ds1000_Numpy_accuracy: 4.09 + ds1000_Tensorflow_accuracy: 11.11 + ds1000_Scipy_accuracy: 7.55 + ds1000_Sklearn_accuracy: 7.83 + ds1000_Pytorch_accuracy: 8.82 + ds1000_Matplotlib_accuracy: 50.97 + mmmlu_lite_accuracy: 45.02 + openai_mmmlu_lite_AR-XY_accuracy: 18.6 + openai_mmmlu_lite_BN-BD_accuracy: 27.58 + openai_mmmlu_lite_DE-DE_accuracy: 51.23 + openai_mmmlu_lite_ES-LA_accuracy: 56.63 + openai_mmmlu_lite_FR-FR_accuracy: 58.11 + openai_mmmlu_lite_HI-IN_accuracy: 33.82 + openai_mmmlu_lite_ID-ID_accuracy: 50.39 + openai_mmmlu_lite_IT-IT_accuracy: 50.39 + openai_mmmlu_lite_JA-JP_accuracy: 50.95 + openai_mmmlu_lite_KO-KR_accuracy: 45.05 + openai_mmmlu_lite_PT-BR_accuracy: 57.89 + openai_mmmlu_lite_SW-KE_accuracy: 32.14 + openai_mmmlu_lite_YO-NG_accuracy: 32.14 + openai_mmmlu_lite_ZH-CN_accuracy: 65.33 + college_naive_average: 21 + high_naive_average: 47 + middle_naive_average: 59.67 + primary_naive_average: 76 + arithmetic_naive_average: 62 + mathbench-a (average)_naive_average: 53.13 + college_knowledge_naive_average: 68.99 + high_knowledge_naive_average: 70.06 + middle_knowledge_naive_average: 78.53 + primary_knowledge_naive_average: 88.49 + mathbench-t (average)_naive_average: 76.51 + + +qwen2.5-7b-instruct-pytorch: + objective: + race-high_accuracy: 85.16 + ARC-c_accuracy: 90.85 + BoolQ_accuracy: 86.61 + triviaqa_wiki_1shot_score: 52.96 + nq_open_1shot_score: 17.62 + mmmlu_lite_naive_average: 54.7 + IFEval_Prompt-level-strict-accuracy: 71.35 + drop_accuracy: 80.23 + bbh_naive_average: 68.88 + GPQA_diamond_accuracy: 36.36 + hellaswag_accuracy: 85.49 + TheoremQA_score: 18.38 + musr_average_naive_average: 43.3 + korbench_single_naive_average: 39.44 + ARC_Prize_Public_Evaluation_accuracy: 0 + gsm8k_accuracy: 91.66 + GaokaoBench_weighted_average: 80.02 + math_accuracy: 73.74 + cmo_fib_accuracy: 26.44 + aime2024_accuracy: 10 + Mathbench_naive_average: 77.08 + wikibench-wiki-single_choice_cncircular_perf_4: 34 + cmmlu_naive_average: 75.9 + mmlu_naive_average: 76.27 + mmlu_pro_naive_average: 56.14 + openai_humaneval_humaneval_pass@1: 84.76 + sanitized_mbpp_score: 74.71 + humanevalx_naive_average: 48.17 + ds1000_naive_average: 18.57 + lcb_code_generation_pass@1: 38.75 + lcb_code_execution_pass@1: 42.38 + lcb_test_output_pass@1: 50.45 + bigcodebench_hard_instruct_pass@1: 16.89 + bigcodebench_hard_complete_pass@1: 12.16 + teval_naive_average: 79.46 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 76.27 + mmlu-stem_accuracy: 77.75 + mmlu-social-science_accuracy: 78.65 + mmlu-humanities_accuracy: 73.12 + mmlu-other_accuracy: 75.05 + cmmlu_accuracy: 75.9 + cmmlu-stem_accuracy: 73.41 + cmmlu-social-science_accuracy: 75.97 + 
cmmlu-humanities_accuracy: 76.42 + cmmlu-other_accuracy: 78.15 + cmmlu-china-specific_accuracy: 73.27 + mmlu_pro_accuracy: 56.14 + mmlu_pro_biology_accuracy: 72.25 + mmlu_pro_business_accuracy: 66.16 + mmlu_pro_chemistry_accuracy: 55.65 + mmlu_pro_computer_science_accuracy: 60.24 + mmlu_pro_economics_accuracy: 66.82 + mmlu_pro_engineering_accuracy: 41.38 + mmlu_pro_health_accuracy: 54.89 + mmlu_pro_history_accuracy: 46.46 + mmlu_pro_law_accuracy: 29.06 + mmlu_pro_math_accuracy: 73.58 + mmlu_pro_philosophy_accuracy: 44.89 + mmlu_pro_physics_accuracy: 60.05 + mmlu_pro_psychology_accuracy: 61.9 + mmlu_pro_other_accuracy: 52.6 + humanevalx-python_pass@1: 51.83 + humanevalx-cpp_pass@1: 42.68 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 73.78 + humanevalx-js_pass@1: 72.56 + ds1000_Pandas_accuracy: 14.09 + ds1000_Numpy_accuracy: 8.64 + ds1000_Tensorflow_accuracy: 17.78 + ds1000_Scipy_accuracy: 15.09 + ds1000_Sklearn_accuracy: 8.7 + ds1000_Pytorch_accuracy: 4.41 + ds1000_Matplotlib_accuracy: 61.29 + mmmlu_lite_accuracy: 54.7 + openai_mmmlu_lite_AR-XY_accuracy: 42.32 + openai_mmmlu_lite_BN-BD_accuracy: 42.18 + openai_mmmlu_lite_DE-DE_accuracy: 60 + openai_mmmlu_lite_ES-LA_accuracy: 66.18 + openai_mmmlu_lite_FR-FR_accuracy: 66.88 + openai_mmmlu_lite_HI-IN_accuracy: 48.63 + openai_mmmlu_lite_ID-ID_accuracy: 61.26 + openai_mmmlu_lite_IT-IT_accuracy: 65.26 + openai_mmmlu_lite_JA-JP_accuracy: 60.7 + openai_mmmlu_lite_KO-KR_accuracy: 60.63 + openai_mmmlu_lite_PT-BR_accuracy: 54.46 + openai_mmmlu_lite_SW-KE_accuracy: 36 + openai_mmmlu_lite_YO-NG_accuracy: 31.86 + openai_mmmlu_lite_ZH-CN_accuracy: 69.4 + college_naive_average: 48.33 + high_naive_average: 59.33 + middle_naive_average: 76.67 + primary_naive_average: 86.67 + arithmetic_naive_average: 74.33 + mathbench-a (average)_naive_average: 69.07 + college_knowledge_naive_average: 83.54 + high_knowledge_naive_average: 80.82 + middle_knowledge_naive_average: 83.79 + primary_knowledge_naive_average: 92.22 + mathbench-t (average)_naive_average: 85.1 + + +internlm3-8b-instruct-turbomind: + objective: + race-high_accuracy: 89.22 + ARC-c_accuracy: 92.54 + BoolQ_accuracy: 86.45 + triviaqa_wiki_1shot_score: 60.72 + nq_open_1shot_score: 20.25 + mmmlu_lite_naive_average: 41.82 + IFEval_Prompt-level-strict-accuracy: 77.45 + drop_accuracy: 83.27 + bbh_naive_average: 55.22 + GPQA_diamond_accuracy: 37.88 + hellaswag_accuracy: 91.28 + TheoremQA_score: 20.12 + musr_average_naive_average: 36.86 + korbench_single_naive_average: 41.2 + ARC_Prize_Public_Evaluation_accuracy: 0.06 + gsm8k_accuracy: 91.28 + GaokaoBench_weighted_average: 86.59 + math_accuracy: 76.96 + cmo_fib_accuracy: 35.1 + aime2024_accuracy: 16.67 + Mathbench_naive_average: 78.96 + wikibench-wiki-single_choice_cncircular_perf_4: 37.45 + cmmlu_naive_average: 83.33 + mmlu_naive_average: 76.21 + mmlu_pro_naive_average: 57.96 + openai_humaneval_humaneval_pass@1: 81.71 + sanitized_mbpp_score: 69.65 + humanevalx_naive_average: 40.73 + ds1000_naive_average: 27.23 + lcb_code_generation_pass@1: 34.75 + lcb_code_execution_pass@1: 49.9 + lcb_test_output_pass@1: 48.19 + bigcodebench_hard_instruct_pass@1: 13.51 + bigcodebench_hard_complete_pass@1: 15.54 + teval_naive_average: 82.86 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 76.21 + mmlu-stem_accuracy: 77.7 + mmlu-social-science_accuracy: 80.98 + mmlu-humanities_accuracy: 70.83 + mmlu-other_accuracy: 75.01 + cmmlu_accuracy: 83.33 + cmmlu-stem_accuracy: 79.66 + cmmlu-social-science_accuracy: 83.39 + cmmlu-humanities_accuracy: 84.73 + 
cmmlu-other_accuracy: 86.2 + cmmlu-china-specific_accuracy: 81.77 + mmlu_pro_accuracy: 57.96 + mmlu_pro_biology_accuracy: 75.45 + mmlu_pro_business_accuracy: 64.64 + mmlu_pro_chemistry_accuracy: 59.81 + mmlu_pro_computer_science_accuracy: 60.24 + mmlu_pro_economics_accuracy: 68.6 + mmlu_pro_engineering_accuracy: 44.79 + mmlu_pro_health_accuracy: 58.31 + mmlu_pro_history_accuracy: 49.87 + mmlu_pro_law_accuracy: 32.43 + mmlu_pro_math_accuracy: 70.17 + mmlu_pro_philosophy_accuracy: 46.89 + mmlu_pro_physics_accuracy: 59.58 + mmlu_pro_psychology_accuracy: 66.29 + mmlu_pro_other_accuracy: 54.33 + humanevalx-python_pass@1: 43.9 + humanevalx-cpp_pass@1: 20.12 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 74.39 + humanevalx-js_pass@1: 65.24 + ds1000_Pandas_accuracy: 16.49 + ds1000_Numpy_accuracy: 34.09 + ds1000_Tensorflow_accuracy: 26.67 + ds1000_Scipy_accuracy: 17.92 + ds1000_Sklearn_accuracy: 20.87 + ds1000_Pytorch_accuracy: 19.12 + ds1000_Matplotlib_accuracy: 55.48 + mmmlu_lite_accuracy: 41.82 + openai_mmmlu_lite_AR-XY_accuracy: 32.56 + openai_mmmlu_lite_BN-BD_accuracy: 4.56 + openai_mmmlu_lite_DE-DE_accuracy: 24.91 + openai_mmmlu_lite_ES-LA_accuracy: 51.09 + openai_mmmlu_lite_FR-FR_accuracy: 61.68 + openai_mmmlu_lite_HI-IN_accuracy: 24.98 + openai_mmmlu_lite_ID-ID_accuracy: 44.56 + openai_mmmlu_lite_IT-IT_accuracy: 52.35 + openai_mmmlu_lite_JA-JP_accuracy: 51.02 + openai_mmmlu_lite_KO-KR_accuracy: 47.93 + openai_mmmlu_lite_PT-BR_accuracy: 53.89 + openai_mmmlu_lite_SW-KE_accuracy: 33.47 + openai_mmmlu_lite_YO-NG_accuracy: 33.47 + openai_mmmlu_lite_ZH-CN_accuracy: 69.05 + college_naive_average: 45.67 + high_naive_average: 64.67 + middle_naive_average: 82.33 + primary_naive_average: 90.33 + arithmetic_naive_average: 74 + mathbench-a (average)_naive_average: 71.4 + college_knowledge_naive_average: 85.28 + high_knowledge_naive_average: 79.43 + middle_knowledge_naive_average: 87.9 + primary_knowledge_naive_average: 93.42 + mathbench-t (average)_naive_average: 86.51 + + +internlm3-8b-instruct-pytorch: + objective: + race-high_accuracy: 89.02 + ARC-c_accuracy: 93.56 + BoolQ_accuracy: 86.67 + triviaqa_wiki_1shot_score: 60.54 + nq_open_1shot_score: 20.3 + mmmlu_lite_naive_average: 42.6 + IFEval_Prompt-level-strict-accuracy: 79.11 + drop_accuracy: 83.32 + bbh_naive_average: 54.76 + GPQA_diamond_accuracy: 42.42 + hellaswag_accuracy: 91.31 + TheoremQA_score: 18 + musr_average_naive_average: 36.62 + korbench_single_naive_average: 41.84 + ARC_Prize_Public_Evaluation_accuracy: 0.06 + gsm8k_accuracy: 90.67 + GaokaoBench_weighted_average: 86.27 + math_accuracy: 76.68 + cmo_fib_accuracy: 33.65 + aime2024_accuracy: 10 + Mathbench_naive_average: 78.92 + wikibench-wiki-single_choice_cncircular_perf_4: 37.35 + cmmlu_naive_average: 83.11 + mmlu_naive_average: 76.23 + mmlu_pro_naive_average: 58.16 + openai_humaneval_humaneval_pass@1: 82.32 + sanitized_mbpp_score: 70.04 + humanevalx_naive_average: 39.76 + ds1000_naive_average: 27.84 + lcb_code_generation_pass@1: 34.5 + lcb_code_execution_pass@1: 48.02 + lcb_test_output_pass@1: 47.74 + bigcodebench_hard_instruct_pass@1: 12.84 + bigcodebench_hard_complete_pass@1: 15.54 + teval_naive_average: 82.86 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 76.23 + mmlu-stem_accuracy: 78.08 + mmlu-social-science_accuracy: 80.31 + mmlu-humanities_accuracy: 71.38 + mmlu-other_accuracy: 74.63 + cmmlu_accuracy: 83.11 + cmmlu-stem_accuracy: 79.42 + cmmlu-social-science_accuracy: 83.34 + cmmlu-humanities_accuracy: 83.95 + cmmlu-other_accuracy: 86.22 + 
cmmlu-china-specific_accuracy: 81.5 + mmlu_pro_accuracy: 58.16 + mmlu_pro_biology_accuracy: 74.62 + mmlu_pro_business_accuracy: 65.02 + mmlu_pro_chemistry_accuracy: 60.69 + mmlu_pro_computer_science_accuracy: 61.46 + mmlu_pro_economics_accuracy: 68.25 + mmlu_pro_engineering_accuracy: 45.3 + mmlu_pro_health_accuracy: 60.15 + mmlu_pro_history_accuracy: 50.66 + mmlu_pro_law_accuracy: 31.7 + mmlu_pro_math_accuracy: 70.32 + mmlu_pro_philosophy_accuracy: 47.7 + mmlu_pro_physics_accuracy: 59.51 + mmlu_pro_psychology_accuracy: 65.41 + mmlu_pro_other_accuracy: 53.46 + humanevalx-python_pass@1: 42.68 + humanevalx-cpp_pass@1: 19.51 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 72.56 + humanevalx-js_pass@1: 64.02 + ds1000_Pandas_accuracy: 14.09 + ds1000_Numpy_accuracy: 35 + ds1000_Tensorflow_accuracy: 24.44 + ds1000_Scipy_accuracy: 20.75 + ds1000_Sklearn_accuracy: 21.74 + ds1000_Pytorch_accuracy: 22.06 + ds1000_Matplotlib_accuracy: 56.77 + mmmlu_lite_accuracy: 42.6 + openai_mmmlu_lite_AR-XY_accuracy: 32.84 + openai_mmmlu_lite_BN-BD_accuracy: 10.46 + openai_mmmlu_lite_DE-DE_accuracy: 24.56 + openai_mmmlu_lite_ES-LA_accuracy: 50.95 + openai_mmmlu_lite_FR-FR_accuracy: 61.05 + openai_mmmlu_lite_HI-IN_accuracy: 30.6 + openai_mmmlu_lite_ID-ID_accuracy: 45.89 + openai_mmmlu_lite_IT-IT_accuracy: 51.79 + openai_mmmlu_lite_JA-JP_accuracy: 51.65 + openai_mmmlu_lite_KO-KR_accuracy: 48.77 + openai_mmmlu_lite_PT-BR_accuracy: 52.7 + openai_mmmlu_lite_SW-KE_accuracy: 32.91 + openai_mmmlu_lite_YO-NG_accuracy: 32.84 + openai_mmmlu_lite_ZH-CN_accuracy: 69.33 + college_naive_average: 47 + high_naive_average: 66.67 + middle_naive_average: 81.67 + primary_naive_average: 89.33 + arithmetic_naive_average: 73.67 + mathbench-a (average)_naive_average: 71.67 + college_knowledge_naive_average: 82.91 + high_knowledge_naive_average: 79.86 + middle_knowledge_naive_average: 88.92 + primary_knowledge_naive_average: 92.96 + mathbench-t (average)_naive_average: 86.16 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 45f74131..16a13209 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -1,21 +1,24 @@ chat: glm-4-9b-chat-hf: - gsm8k_accuracy: 68.75 - race-high_accuracy: 90.62 + gsm8k_accuracy: 56.25 + race-high_accuracy: 84.38 glm-4-9b-chat-turbomind: gsm8k_accuracy: 71.88 race-high_accuracy: 90.62 glm-4-9b-chat-vllm: - gsm8k_accuracy: 71.88 + gsm8k_accuracy: 68.75 race-high_accuracy: 90.62 deepseek-7b-chat-hf: gsm8k_accuracy: 46.88 race-high_accuracy: 81.25 - deepseek-moe-16b-chat-hf: - gsm8k_accuracy: 50 - race-high_accuracy: 68.75 + deepseek-r1-distill-llama-8b-turbomind: + gsm8k_accuracy: 31.25 + race-high_accuracy: 81.25 + deepseek-r1-distill-qwen-1_5b-turbomind: + gsm8k_accuracy: 37.5 + race-high_accuracy: 53.12 deepseek-7b-chat-vllm: - gsm8k_accuracy: 50 + gsm8k_accuracy: 43.75 race-high_accuracy: 78.12 gemma2-2b-it-hf: gsm8k_accuracy: 50 @@ -36,34 +39,40 @@ chat: gsm8k_accuracy: 78.12 race-high_accuracy: 93.75 gemma-7b-it-vllm: - gsm8k_accuracy: 46.88 + gsm8k_accuracy: 31.25 race-high_accuracy: 68.75 internlm2_5-7b-chat-hf: gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 + internlm3-8b-instruct-hf: + gsm8k_accuracy: 65.62 + race-high_accuracy: 87.5 internlm2_5-7b-chat-turbomind: - gsm8k_accuracy: 87.50 + gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 internlm2-chat-1.8b-turbomind: gsm8k_accuracy: 28.12 race-high_accuracy: 84.38 internlm2-chat-1.8b-sft-turbomind: - gsm8k_accuracy: 21.88 + 
gsm8k_accuracy: 31.25 race-high_accuracy: 84.38 internlm2-chat-7b-lmdeploy: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 59.38 race-high_accuracy: 84.38 internlm2-chat-7b-sft-turbomind: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 56.25 race-high_accuracy: 90.62 + internlm3-8b-instruct-turbomind: + gsm8k_accuracy: 68.75 + race-high_accuracy: 87.5 internlm2-chat-7b-vllm: - gsm8k_accuracy: 43.75 - race-high_accuracy: 84.38 + gsm8k_accuracy: 59.38 + race-high_accuracy: 87.50 llama-3_1-8b-instruct-hf: gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 llama-3_2-3b-instruct-hf: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 71.88 race-high_accuracy: 81.25 llama-3-8b-instruct-hf: gsm8k_accuracy: 68.75 @@ -72,14 +81,14 @@ chat: gsm8k_accuracy: 18.75 race-high_accuracy: 46.88 llama-3_1-8b-instruct-turbomind: - gsm8k_accuracy: 78.12 + gsm8k_accuracy: 81.25 race-high_accuracy: 90.62 llama-3_2-3b-instruct-turbomind: - gsm8k_accuracy: 65.62 + gsm8k_accuracy: 75.00 race-high_accuracy: 81.25 llama-3-8b-instruct-turbomind: - gsm8k_accuracy: 71.88 - race-high_accuracy: 87.5 + gsm8k_accuracy: 68.75 + race-high_accuracy: 84.38 mistral-7b-instruct-v0.2-hf: gsm8k_accuracy: 40.62 race-high_accuracy: 75 @@ -94,13 +103,10 @@ chat: race-high_accuracy: 78.12 mistral-7b-instruct-v0.1-vllm: gsm8k_accuracy: 34.38 - race-high_accuracy: 68.75 + race-high_accuracy: 65.62 mistral-7b-instruct-v0.2-vllm: - gsm8k_accuracy: 31.25 - race-high_accuracy: 75 - phi-3-mini-4k-instruct-hf: - gsm8k_accuracy: 81.25 - race-high_accuracy: 87.50 + gsm8k_accuracy: 21.88 + race-high_accuracy: 78.12 qwen2.5-0.5b-instruct-hf: gsm8k_accuracy: 34.38 race-high_accuracy: 46.88 @@ -108,10 +114,10 @@ chat: gsm8k_accuracy: 53.12 race-high_accuracy: 90.62 qwen2.5-0.5b-instruct-turbomind: - gsm8k_accuracy: 28.12 - race-high_accuracy: 50 + gsm8k_accuracy: 31.25 + race-high_accuracy: 43.75 qwen2.5-3b-instruct-turbomind: - gsm8k_accuracy: 59.38 + gsm8k_accuracy: 56.25 race-high_accuracy: 90.62 qwen1.5-0.5b-chat-hf: gsm8k_accuracy: 0 @@ -123,11 +129,11 @@ chat: gsm8k_accuracy: 68.75 race-high_accuracy: 90.62 qwen2-1.5b-instruct-turbomind: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 56.25 race-high_accuracy: 84.38 qwen2-7b-instruct-turbomind: gsm8k_accuracy: 81.25 - race-high_accuracy: 90.62 + race-high_accuracy: 87.50 qwen1.5-0.5b-chat-vllm: gsm8k_accuracy: 3.12 race-high_accuracy: 53.12 @@ -143,11 +149,11 @@ chat: yi-1.5-9b-chat-turbomind: gsm8k_accuracy: 71.88 race-high_accuracy: 93.75 - deepseek-v2-lite-chat-hf: - gsm8k_accuracy: 46.88 + deepseek-v2_lite-chat-turbomind: + gsm8k_accuracy: 37.5 race-high_accuracy: 71.88 gemma2-27b-it-hf: - gsm8k_accuracy: 75 + gsm8k_accuracy: 71.88 race-high_accuracy: 93.75 internlm2_5-20b-chat-hf: gsm8k_accuracy: 84.38 @@ -161,6 +167,9 @@ chat: mistral-small-instruct-2409-turbomind: gsm8k_accuracy: 81.25 race-high_accuracy: 87.50 + phi-4: + gsm8k_accuracy: 81.25 + race-high_accuracy: 87.50 qwen2.5-14b-instruct-hf: gsm8k_accuracy: 71.88 race-high_accuracy: 96.88 @@ -168,40 +177,41 @@ chat: gsm8k_accuracy: 68.75 race-high_accuracy: 93.75 yi-1.5-34b-chat-turbomind: - gsm8k_accuracy: 78.12 + gsm8k_accuracy: 75.00 race-high_accuracy: 93.75 - deepseek-67b-chat-hf: - gsm8k_accuracy: 71.88 + deepseek-67b-chat-turbomind: + gsm8k_accuracy: 75.00 race-high_accuracy: 78.12 + deepseek-r1-distill-qwen-32b-turbomind: + gsm8k_accuracy: 25 + race-high_accuracy: 90.62 llama-3_3-70b-instruct-turbomind: gsm8k_accuracy: 93.75 race-high_accuracy: 87.5 - mixtral-8x7b-instruct-v0.1-hf: - gsm8k_accuracy: 59.38 - race-high_accuracy: 81.25 
mixtral-large-instruct-2411-turbomind: - gsm8k_accuracy: 90.62 + gsm8k_accuracy: 87.50 race-high_accuracy: 93.75 nvidia-3_1-Nemotron-70b-instruct-HF-turbomind: - gsm8k_accuracy: 87.5 - race-high_accuracy: 46.88 + gsm8k_accuracy: 93.75 + race-high_accuracy: 50.00 qwen2.5-72b-instruct-turbomind: - gsm8k_accuracy: 75 - race-high_accuracy: 93.75 + gsm8k_accuracy: 81.25 + race-high_accuracy: 90.62 + deepseek-r1-distill-llama-70b-turbomind: + gsm8k_accuracy: 40.62 + race-high_accuracy: 90.62 deepseek-v2_5-1210-turbomind: gsm8k_accuracy: 90.62 race-high_accuracy: 84.38 - mixtral-8x22b-instruct-v0.1-hf: - gsm8k_accuracy: 81.25 - race-high_accuracy: 81.25 + mixtral-8x22b-instruct-v0.1-turbomind: + gsm8k_accuracy: 75 + race-high_accuracy: 78.12 + mixtral-8x22b-instruct-v0.1-vllm: + gsm8k_accuracy: 78.12 + race-high_accuracy: 78.12 base: - glm-4-9b-hf: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 glm-4-9b-turbomind: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 56.25 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 @@ -210,15 +220,10 @@ base: GPQA_diamond_accuracy: 0 race-high_accuracy: 46.88 winogrande_accuracy: 71.88 - deepseek-moe-16b-base-hf: - gsm8k_accuracy: 21.88 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 21.88 - winogrande_accuracy: 65.62 deepseek-7b-base-turbomind: - gsm8k_accuracy: 21.88 + gsm8k_accuracy: 18.75 GPQA_diamond_accuracy: 0 - race-high_accuracy: 46.88 + race-high_accuracy: 43.75 winogrande_accuracy: 84.38 deepseek-moe-16b-base-vllm: gsm8k_accuracy: 21.88 @@ -245,16 +250,21 @@ base: GPQA_diamond_accuracy: 3.12 race-high_accuracy: 65.62 winogrande_accuracy: 71.88 + gemma-2-9b-turbomind: + gsm8k_accuracy: 68.75 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 78.12 + winogrande_accuracy: 50 gemma-2b-vllm: gsm8k_accuracy: 15.62 GPQA_diamond_accuracy: 3.12 - race-high_accuracy: - winogrande_accuracy: + race-high_accuracy: 28.12 + winogrande_accuracy: 68.75 gemma-7b-vllm: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: - winogrande_accuracy: + gsm8k_accuracy: 43.75 + GPQA_diamond_accuracy: 6.25 + race-high_accuracy: 81.25 + winogrande_accuracy: 81.25 internlm2_5-7b-hf: gsm8k_accuracy: 37.5 GPQA_diamond_accuracy: 25 @@ -265,30 +275,25 @@ base: GPQA_diamond_accuracy: 18.75 race-high_accuracy: 62.5 winogrande_accuracy: 78.12 - internlm2-base-7b-hf: - gsm8k_accuracy: 3.12 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 75 - winogrande_accuracy: 65.62 internlm2-1.8b-turbomind: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 6.25 + GPQA_diamond_accuracy: 12.5 race-high_accuracy: 71.88 - winogrande_accuracy: 78.12 + winogrande_accuracy: 75 internlm2_5-7b-turbomind: - gsm8k_accuracy: 62.50 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 34.38 race-high_accuracy: 93.75 - winogrande_accuracy: 87.50 + winogrande_accuracy: 84.38 internlm2-7b-turbomind: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 21.88 + gsm8k_accuracy: 50 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 71.88 winogrande_accuracy: 84.38 internlm2-base-7b-turbomind: gsm8k_accuracy: 37.50 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 81.25 + GPQA_diamond_accuracy: 21.88 + race-high_accuracy: 84.38 winogrande_accuracy: 75 llama-2-7b-hf: gsm8k_accuracy: 21.88 @@ -311,7 +316,7 @@ base: race-high_accuracy: 78.12 winogrande_accuracy: 78.12 llama-3-8b-turbomind: - gsm8k_accuracy: 50 + gsm8k_accuracy: 46.88 GPQA_diamond_accuracy: 12.50 race-high_accuracy: 65.62 
winogrande_accuracy: 78.12 @@ -327,14 +332,14 @@ base: winogrande_accuracy: 71.88 qwen2.5-1.5b-turbomind: gsm8k_accuracy: 62.50 - GPQA_diamond_accuracy: 12.50 - race-high_accuracy: 78.12 - winogrande_accuracy: 68.75 - qwen2.5-7b-turbomind: - gsm8k_accuracy: 75.00 - GPQA_diamond_accuracy: 25 - race-high_accuracy: 87.5 + GPQA_diamond_accuracy: 15.62 + race-high_accuracy: 75 winogrande_accuracy: 71.88 + qwen2.5-7b-turbomind: + gsm8k_accuracy: 71.88 + GPQA_diamond_accuracy: 18.75 + race-high_accuracy: 87.5 + winogrande_accuracy: 75.00 qwen1.5-moe-a2.7b-hf: gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 18.75 @@ -356,17 +361,17 @@ base: race-high_accuracy: 87.5 winogrande_accuracy: 68.75 qwen2-1.5b-turbomind: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 12.50 race-high_accuracy: 81.25 winogrande_accuracy: 75 qwen2-7b-turbomind: - gsm8k_accuracy: 75.00 + gsm8k_accuracy: 65.62 GPQA_diamond_accuracy: 12.5 race-high_accuracy: 87.5 winogrande_accuracy: 71.88 qwen1.5-0.5b-vllm: - gsm8k_accuracy: 9.38 + gsm8k_accuracy: 6.25 GPQA_diamond_accuracy: 0 race-high_accuracy: 56.25 winogrande_accuracy: 62.5 @@ -382,27 +387,12 @@ base: winogrande_accuracy: 59.38 yi-1.5-9b-turbomind: gsm8k_accuracy: 78.12 - GPQA_diamond_accuracy: 40.62 + GPQA_diamond_accuracy: 43.75 race-high_accuracy: 87.5 winogrande_accuracy: 71.88 - deepseek-v2-lite-hf: - gsm8k_accuracy: 31.25 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 59.38 - winogrande_accuracy: 71.88 - internlm2-20b-hf: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 68.75 - winogrande_accuracy: 75 - internlm2-base-20b-hf: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: 84.38 - winogrande_accuracy: 65.62 internlm2-20b-turbomind: - gsm8k_accuracy: 71.88 - GPQA_diamond_accuracy: 15.62 + gsm8k_accuracy: 75 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 68.75 winogrande_accuracy: 81.25 qwen2.5-14b-hf: @@ -416,37 +406,27 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 78.12 qwen2.5-32b-turbomind: - gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 28.12 + gsm8k_accuracy: 87.5 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 - deepseek-67b-base-hf: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 81.25 - winogrande_accuracy: 90.62 deepseek-67b-base-turbomind: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 53.12 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 81.25 winogrande_accuracy: 84.38 llama-3-70b-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 12.50 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 qwen2.5-72b-turbomind: gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 34.38 + GPQA_diamond_accuracy: 31.25 race-high_accuracy: 93.75 winogrande_accuracy: 87.5 deepseek-v2-turbomind: - gsm8k_accuracy: 65.62 - GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 - llama-3-70b-hf: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 3.12 race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 + winogrande_accuracy: 81.25 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 3cdb3a73..6a1c2ebc 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -61,6 +61,7 @@ env: HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub HF_HUB_CACHE: 
/fs-computility/llm/shared/llmeval/models/opencompass_hf_hub CONDA_ENV: regression_test + VLLM_WORKER_MULTIPROC_METHOD: spawn jobs: build-pypi: @@ -92,7 +93,6 @@ jobs: matrix: pyver: [py310] runs-on: ubuntu-latest - environment: 'prod' env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 @@ -126,7 +126,6 @@ jobs: if: ${{!cancelled()}} needs: ['build-pypi', 'build-pypi-lmdeploy'] runs-on: volc_cu12 - environment: 'prod' timeout-minutes: 120 #2hours steps: - name: Clone repository @@ -157,7 +156,9 @@ jobs: pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} - pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} + pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}} + pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}} + pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}} FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}} cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data @@ -188,7 +189,6 @@ jobs: matrix: regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}} runs-on: volc_cu12_daily - environment: 'prod' timeout-minutes: 180 #3hours steps: - name: Clone repository @@ -229,7 +229,6 @@ jobs: matrix: regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}} runs-on: volc_cu12_local - environment: 'prod' timeout-minutes: 480 #6hours steps: - name: Clone repository @@ -256,27 +255,33 @@ jobs: conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details + opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details + opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details rm 
regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily + python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run model test - api if: matrix.regression_func == 'api' run: | . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} conda info --envs - lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & + lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & echo "restful_pid=$!" >> "$GITHUB_ENV" sleep 180s + env | grep PROXY + env | grep proxy + unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py @@ -305,7 +310,6 @@ jobs: matrix: function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}} runs-on: volc_cu12 - environment: 'prod' timeout-minutes: 480 #6hours steps: - name: Clone repository @@ -339,7 +343,6 @@ jobs: needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test] timeout-minutes: 5 runs-on: self-hosted - environment: 'prod' steps: - name: notify run: | diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index 032c4bc0..45fbd634 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -45,7 +45,7 @@ jobs: . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} python3 -m pip uninstall opencompass -y - python3 -m pip install -e . 
diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml
index 032c4bc0..45fbd634 100644
--- a/.github/workflows/pr-run-test.yml
+++ b/.github/workflows/pr-run-test.yml
@@ -45,7 +45,7 @@ jobs:
           . ${{env.CONDA_PATH}}/bin/activate
           conda activate ${{env.CONDA_ENV}}
           python3 -m pip uninstall opencompass -y
-          python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
+          python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
       - name: conda env
         run: |
diff --git a/README.md b/README.md
index f3c6028a..a17a1998 100644
--- a/README.md
+++ b/README.md
@@ -176,69 +176,83 @@ Some third-party features, like Humaneval and Llama, may require additional step

 After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared. Now you can start your first evaluation using OpenCompass!

-- Your first evaluation with OpenCompass!
+### Your first evaluation with OpenCompass!

-  OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder.
+OpenCompass supports setting your configs via the CLI or a Python script. For simple evaluation settings we recommend the CLI; for more complex evaluations, the script approach is suggested. You can find more example scripts under the configs folder.

-  ```bash
-  # CLI
-  opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
+```bash
+# CLI
+opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen

-  # Python scripts
-  opencompass examples/eval_chat_demo.py
-  ```
+# Python scripts
+opencompass examples/eval_chat_demo.py
+```

-  You can find more script examples under [examples](./examples) folder.
+You can find more script examples under the [examples](./examples) folder.

-- API evaluation
+### API evaluation

-  OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings.
+OpenCompass, by design, does not discriminate between open-source models and API models. You can evaluate both model types in the same way, or even in one setting.

-  ```bash
-  export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
-  # CLI
-  opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
+```bash
+export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
+# CLI
+opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen

-  # Python scripts
-  opencompass examples/eval_api_demo.py
+# Python scripts
+opencompass examples/eval_api_demo.py

-  # You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
-  ```
+# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models; we set max_completion_tokens=8192 by default.
+```

-- Accelerated Evaluation
+### Accelerated Evaluation

-  Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
+Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
-  ```bash
-  # CLI
-  opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
+```bash
+# CLI
+opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy

-  # Python scripts
-  opencompass examples/eval_lmdeploy_demo.py
-  ```
+# Python scripts
+opencompass examples/eval_lmdeploy_demo.py
+```

-- Supported Models
+### Supported Models and Datasets

-  OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
+OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).

-  ```bash
-  # List all configurations
-  python tools/list_configs.py
-  # List all configurations related to llama and mmlu
-  python tools/list_configs.py llama mmlu
-  ```
+```bash
+# List all configurations
+python tools/list_configs.py
+# List all configurations related to llama and mmlu
+python tools/list_configs.py llama mmlu
+```

-  If the model is not on the list but supported by Huggingface AutoModel class, you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
+#### Supported Models

-  ```bash
-  opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
-  ```
+If the model is not on the list but is supported by the Huggingface AutoModel class, or by an inference engine wrapper exposing an OpenAI-compatible interface (see the [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.

-  If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
+```bash
+opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
+```

-  ```bash
-  CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
-  ```
+#### Supported Datasets
+
+Currently, OpenCompass provides standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llm_judge_gen.py` point to the recommended config we provide for that dataset. You can refer to the [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
+
+```bash
+# Recommended Evaluation Config based on Rules
+opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
+
+# Recommended Evaluation Config based on LLM Judge
+opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
+```
+
+If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
+```

 > \[!TIP\]
 >
@@ -288,7 +302,7 @@ You can quickly find the dataset you need from the list through sorting, filteri

 In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.

-Please refer to the dataset statistics chapter of [official document](https://opencompass.org.cn/doc) for details.
+Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.
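The README hunk above documents the recommended `_gen.py` / `_llm_judge_gen.py` configs via the CLI; the script-based equivalent pulls the same configs in through `read_base()`. A hedged sketch: the exact module paths and the `aime2024_datasets` variable name are assumed from the repository's usual `opencompass/configs/...` naming convention.

```python
# Sketch: script-based counterpart of `opencompass --datasets aime2024_gen
# --models hf_internlm2_5_1_8b_chat`; the module paths below are assumed.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.aime2024.aime2024_gen import \
        aime2024_datasets  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import \
        models as hf_internlm2_5_1_8b_chat_model  # noqa: F401, E501

datasets = [*aime2024_datasets]
models = [*hf_internlm2_5_1_8b_chat_model]
work_dir = './outputs/aime2024_demo'
```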
diff --git a/README_zh-CN.md b/README_zh-CN.md
index a4ef743f..4406c7bc 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -208,9 +208,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
   opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
   ```

-  OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
+- ### 支持的模型与数据集

-- ### 支持的模型
+  OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。

   ```bash
   # 列出所有配置
@@ -219,13 +219,27 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce
   python tools/list_configs.py llama mmlu
   ```

-  如果模型不在列表中但支持 Huggingface AutoModel 类,您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。
+  #### 支持的模型
+
+  如果模型不在列表中,但支持 Huggingface AutoModel 类或支持针对 OpenAI 接口的推理引擎封装(详见[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html)),您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。

   ```bash
   opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
   ```

-  如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。
+  #### 支持的数据集
+
+  目前,OpenCompass针对数据集给出了标准的推荐配置。通常,`_gen.py`或`_llm_judge_gen.py`为结尾的配置文件将指向我们为该数据集提供的推荐配置。您可以参阅[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节来获取详细信息。
+
+  ```bash
+  # 基于规则的推荐配置
+  opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
+
+  # 基于LLM Judge的推荐配置
+  opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat
+  ```
+
+  此外,如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。

   ```bash
   CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
   ```
@@ -281,9 +295,7 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下

 您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。

-另外,我们为每个数据集都提供了一种推荐配置,部分数据集还支持了基于LLM Judge的配置。
-
-详情请参阅 [官方文档](https://opencompass.org.cn/doc) 的数据集统计章节。
+详情请参阅 [官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节。
diff --git a/dataset-index.yml b/dataset-index.yml
index 59b6d4da..de5e316e 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -121,7 +121,7 @@
     category: Reasoning
     paper: https://arxiv.org/pdf/2310.16049
     configpath: opencompass/configs/datasets/musr/musr_gen.py
-    configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py
+    configpath_llmjudge: opencompass/configs/datasets/musr/musr_llm_judge_gen.py
 - needlebench:
     name: NeedleBench
     category: Long Context
@@ -715,6 +715,12 @@
     paper: https://arxiv.org/pdf/1809.02789v1
     configpath: opencompass/configs/datasets/obqa/obqa_gen.py
     configpath_llmjudge: ''
+- olymmath:
+    name: OlymMATH
+    category: Math
+    paper: https://arxiv.org/abs/2503.21380
+    configpath: ''
+    configpath_llmjudge: opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py
 - piqa:
     name: OpenBookQA
     category: Knowledge / Physics
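The `configpath` / `configpath_llmjudge` fields edited above are exactly what the docs statistics scripts later in this patch consume, so a stale path (like the musr entry pointing at the mmlu judge config) silently produces a wrong link. A hedged sketch for sanity-checking the index locally; the script is illustrative and not part of the repository:

```python
# Sketch: flag dataset-index.yml entries whose config paths do not exist on
# disk, e.g. to catch mix-ups like the musr -> mmlu one fixed above.
import os

import yaml

with open('dataset-index.yml', 'r') as f:
    index = yaml.safe_load(f)

for entry in index:
    # Each entry is a single-key mapping, e.g. {'olymmath': {...}}.
    (key, meta), = entry.items()
    for field in ('configpath', 'configpath_llmjudge'):
        paths = meta.get(field) or ''
        paths = paths if isinstance(paths, list) else [paths]
        for path in paths:
            if path and not os.path.exists(path):
                print(f'{key}: {field} points at a missing file: {path}')
```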
diff --git a/docs/en/conf.py b/docs/en/conf.py
index 9101ba3f..150d5ca7 100644
--- a/docs/en/conf.py
+++ b/docs/en/conf.py
@@ -117,6 +117,10 @@ html_js_files = [
     'js/custom.js'
 ]

+html_context = {
+    'github_version': 'main',
+}
+
 # -- Options for HTMLHelp output ---------------------------------------------

 # Output file base name for HTML help builder.
diff --git a/docs/en/statis.py b/docs/en/statis.py
index 483ebf78..daabe818 100755
--- a/docs/en/statis.py
+++ b/docs/en/statis.py
@@ -32,12 +32,23 @@ with open(load_path, 'r') as f2:

 HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']

+recommanded_dataset_list = [
+    'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
+    'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
+    'mmlu_pro', 'musr'
+]
+

 def table_format(data_list):
     table_format_list = []
     for i in data_list:
         table_format_list_sub = []
         for j in i:
+            if j in recommanded_dataset_list:
+                link_token = '[link]('
+            else:
+                link_token = '[link(TBD)]('
+
             for index in HEADER:
                 if index == 'paper':
                     table_format_list_sub.append('[link](' + i[j][index] + ')')
@@ -45,18 +56,18 @@ def table_format(data_list):
                     if i[j][index] == '':
                         table_format_list_sub.append(i[j][index])
                     else:
-                        table_format_list_sub.append('[link](' +
+                        table_format_list_sub.append(link_token +
                                                      GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 elif index == 'configpath':
                     if isinstance(i[j][index], list):
                         sub_list_text = ''
                         for k in i[j][index]:
-                            sub_list_text += ('[link](' + GITHUB_PREFIX + k +
+                            sub_list_text += (link_token + GITHUB_PREFIX + k +
                                               ') / ')
                         table_format_list_sub.append(sub_list_text[:-2])
                     else:
-                        table_format_list_sub.append('[link](' +
+                        table_format_list_sub.append(link_token +
                                                      GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 else:
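The `link_token` switch above means only datasets in `recommanded_dataset_list` keep a plain `[link]` label, while everything else is rendered as `[link(TBD)]`. A standalone sketch of that cell-building behavior; the sample data and the `GITHUB_PREFIX` value are assumptions for the demo:

```python
# Sketch of the link_token behavior added to docs/en/statis.py: datasets
# outside the recommended list get a "(TBD)" link label. Prefix and sample
# inputs are invented for illustration.
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/blob/main/'
recommanded_dataset_list = ['mmlu', 'musr']  # abbreviated for the demo


def config_link(dataset_key: str, configpath: str) -> str:
    """Build the markdown link cell the statistics table would contain."""
    token = '[link](' if dataset_key in recommanded_dataset_list else '[link(TBD)]('
    return token + GITHUB_PREFIX + configpath + ')'


print(config_link('musr', 'opencompass/configs/datasets/musr/musr_gen.py'))
print(config_link('olymmath',
                  'opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py'))
```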
diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py
index 8910ead0..2a5e3f59 100644
--- a/docs/zh_cn/conf.py
+++ b/docs/zh_cn/conf.py
@@ -117,6 +117,10 @@ html_js_files = [
     'js/custom.js'
 ]

+html_context = {
+    'github_version': 'main',
+}
+
 # -- Options for HTMLHelp output ---------------------------------------------

 # Output file base name for HTML help builder.
diff --git a/docs/zh_cn/statis.py b/docs/zh_cn/statis.py
index 19d03bfd..04134cf6 100755
--- a/docs/zh_cn/statis.py
+++ b/docs/zh_cn/statis.py
@@ -30,12 +30,23 @@ with open(load_path, 'r') as f2:

 HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']

+recommanded_dataset_list = [
+    'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa',
+    'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu',
+    'mmlu_pro', 'musr'
+]
+

 def table_format(data_list):
     table_format_list = []
     for i in data_list:
         table_format_list_sub = []
         for j in i:
+            if j in recommanded_dataset_list:
+                link_token = '[链接]('
+            else:
+                link_token = '[链接(TBD)]('
+
             for index in HEADER:
                 if index == 'paper':
                     table_format_list_sub.append('[链接](' + i[j][index] + ')')
@@ -43,17 +54,19 @@ def table_format(data_list):
                     if i[j][index] == '':
                         table_format_list_sub.append(i[j][index])
                     else:
-                        table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
+                        table_format_list_sub.append(link_token +
+                                                     GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 elif index == 'configpath':
                     if isinstance(i[j][index], list):
                         sub_list_text = ''
                         for k in i[j][index]:
-                            sub_list_text += ('[链接](' + GITHUB_PREFIX + k +
+                            sub_list_text += (link_token + GITHUB_PREFIX + k +
                                               ') / ')
                         table_format_list_sub.append(sub_list_text[:-2])
                     else:
-                        table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
+                        table_format_list_sub.append(link_token +
+                                                     GITHUB_PREFIX +
                                                      i[j][index] + ')')
                 else:
                     table_format_list_sub.append(i[j][index])
diff --git a/opencompass/__init__.py b/opencompass/__init__.py
index f0ede3d3..a9873473 100644
--- a/opencompass/__init__.py
+++ b/opencompass/__init__.py
@@ -1 +1 @@
-__version__ = '0.4.1'
+__version__ = '0.4.2'
diff --git a/opencompass/configs/datasets/OlymMATH/README.md b/opencompass/configs/datasets/OlymMATH/README.md
new file mode 100644
index 00000000..53c9b7a0
--- /dev/null
+++ b/opencompass/configs/datasets/OlymMATH/README.md
@@ -0,0 +1,60 @@
+# OlymMATH
+[GitHub Link](https://github.com/RUCAIBox/OlymMATH)
+
+For the OlymMATH dataset, please refer to the paper:
+"Challenging the Boundaries of Reasoning: An Olympiad-Level Math Benchmark for Large Language Models" by Haoxiang Sun, Yingqian Min, Zhipeng Chen, Wayne Xin Zhao, Zheng Liu, Zhongyuan Wang, Lei Fang, and Ji-Rong Wen.
+
+
+## How to eval OlymMATH with model judge
+This is a simple example:
+```python
+
+from opencompass.models import OpenAISDK, OpenAI
+from mmengine.config import read_base
+
+
+with read_base():
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as qwen2_5_7b_instruct_model
+    from opencompass.configs.datasets.OlymMATH.olymmath_gen import olymmath_datasets
+
+################## Judge Config ##################
+api_meta_template = dict(round=[
+    dict(role='HUMAN', api_role='HUMAN'),
+    dict(role='BOT', api_role='BOT', generate=True),
+], )
+
+judge_cfg = dict(
+    # An API model with OpenAI API format is required for Judge
+    abbr='qwen2-5-32B-Instruct',
+    type=OpenAISDK,
+    path='Qwen/Qwen2.5-32B-Instruct',
+    key='sk-1234',
+    openai_api_base=[
+        'http://172.30.56.1:4000/v1',
+    ],
+    meta_template=api_meta_template,
+    query_per_second=16,
+    batch_size=1024,
+    temperature=0.001,
+    max_completion_tokens=32768,
+    tokenizer_path='gpt-4o-2024-05-13',
+    verbose=True,
+    max_out_len=16384,
+    max_seq_len=32768,
+)
+
+################## Model Config ##################
+models = [*qwen2_5_7b_instruct_model]
+
+################## Dataset Config ##################
+datasets = [*olymmath_datasets]
+
+# Set judge_cfg for evaluation
+for item in datasets:
+    item['infer_cfg']['inferencer']['max_out_len'] = 32768
+    if 'judge_cfg' in item['eval_cfg']['evaluator']:
+        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
+
+
+work_dir = './outputs/olymmath_llm_eval'
+```
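If no OpenAI-compatible judge endpoint is already available, one way to provide the `openai_api_base` that `judge_cfg` expects is the same `lmdeploy serve api_server` entry point the API regression job in this patch uses. A hedged sketch; serving the judge model locally and the chosen port are assumptions, not part of this PR:

```python
# Sketch: start a local OpenAI-compatible endpoint for the judge model with
# lmdeploy, then point judge_cfg's openai_api_base at it. Port is arbitrary.
import subprocess

judge_server = subprocess.Popen([
    'lmdeploy', 'serve', 'api_server', 'Qwen/Qwen2.5-32B-Instruct',
    '--server-port', '4000',
])
# judge_cfg would then use openai_api_base=['http://localhost:4000/v1'].
# Call judge_server.terminate() once the evaluation finishes.
```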
diff --git a/opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py b/opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py
new file mode 100644
index 00000000..dfc80538
--- /dev/null
+++ b/opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py
@@ -0,0 +1,5 @@
+from mmengine.config import read_base
+
+with read_base():
+    # Default use LLM as a judge
+    from .olymmath_llmverify_gen_97b203 import olymmath_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/OlymMATH/olymmath_llmverify_gen_97b203.py b/opencompass/configs/datasets/OlymMATH/olymmath_llmverify_gen_97b203.py
new file mode 100644
index 00000000..0c517a13
--- /dev/null
+++ b/opencompass/configs/datasets/OlymMATH/olymmath_llmverify_gen_97b203.py
@@ -0,0 +1,99 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets import OlymMATHDataset
+
+
+# ----------------------------- Detailed Config -----------------------------
+
+math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test')
+
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy']
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+