diff --git a/.github/scripts/eval_regression_base_models.py b/.github/scripts/eval_regression_base_models.py
index a3045378..56e64c2f 100644
--- a/.github/scripts/eval_regression_base_models.py
+++ b/.github/scripts/eval_regression_base_models.py
@@ -116,6 +116,8 @@ with read_base():
 
     from ...volc import infer as volc_infer  # noqa: F401, E501
 
+hf_glm4_9b_model[0]['path'] = 'THUDM/glm-4-9b-hf'
+
 race_datasets = [race_datasets[1]]
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py
index f2bc484e..bfe923f6 100644
--- a/.github/scripts/eval_regression_chat_models.py
+++ b/.github/scripts/eval_regression_chat_models.py
@@ -97,8 +97,6 @@ with read_base():
         models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
         models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
-    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
-        models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
         models as \
         lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml
index ea0e88f6..e317e1d5 100644
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -89,9 +89,6 @@ chat:
     llama-3-8b-instruct-turbomind:
         gsm8k_accuracy: 68.75
         race-high_accuracy: 84.38
-    internvl2_5-8b-turbomind:
-        gsm8k_accuracy: 0
-        race-high_accuracy: 0
     mistral-7b-instruct-v0.2-hf:
         gsm8k_accuracy: 40.62
         race-high_accuracy: 75
@@ -182,15 +179,15 @@ chat:
     yi-1.5-34b-chat-turbomind:
         gsm8k_accuracy: 75.00
         race-high_accuracy: 93.75
+    deepseek-67b-chat-turbomind:
+        gsm8k_accuracy: 75.00
+        race-high_accuracy: 78.12
     deepseek-r1-distill-qwen-32b-turbomind:
         gsm8k_accuracy: 25
         race-high_accuracy: 90.62
     llama-3_3-70b-instruct-turbomind:
         gsm8k_accuracy: 93.75
         race-high_accuracy: 87.5
-    mixtral-8x7b-instruct-v0.1-hf:
-        gsm8k_accuracy: 59.38
-        race-high_accuracy: 81.25
     mixtral-large-instruct-2411-turbomind:
         gsm8k_accuracy: 87.50
         race-high_accuracy: 93.75
@@ -228,15 +225,10 @@ base:
         GPQA_diamond_accuracy: 0
         race-high_accuracy: 46.88
         winogrande_accuracy: 71.88
-    deepseek-moe-16b-base-hf:
-        gsm8k_accuracy: 21.88
-        GPQA_diamond_accuracy: 0
-        race-high_accuracy: 21.88
-        winogrande_accuracy: 65.62
     deepseek-7b-base-turbomind:
         gsm8k_accuracy: 21.88
         GPQA_diamond_accuracy: 0
-        race-high_accuracy: 46.88
+        race-high_accuracy: 43.75
         winogrande_accuracy: 84.38
     deepseek-moe-16b-base-vllm:
         gsm8k_accuracy: 21.88
@@ -269,7 +261,7 @@ base:
         race-high_accuracy:
         winogrande_accuracy:
     gemma-7b-vllm:
-        gsm8k_accuracy: 53.12
+        gsm8k_accuracy: 43.75
         GPQA_diamond_accuracy: 9.38
         race-high_accuracy:
         winogrande_accuracy:
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index 40b1c41d..ec0fc644 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -258,7 +258,7 @@ jobs:
           conda info --envs
           export from_tf=TRUE
           python tools/list_configs.py internlm2_5 mmlu
-          opencompass --models hf_internlm2_5_7b hf_internlm3_8b_instruct --datasets race_ppl demo_gsm8k_chat_gen --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
+          opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
           rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
           python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
           opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --batch-size 1 --max-out-len 256 --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details