From c48bbde26fc98a0876a3450fe8655f6a0e7f2faf Mon Sep 17 00:00:00 2001
From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Date: Wed, 25 Dec 2024 17:26:50 +0800
Subject: [PATCH] [ci] move testcase into volc engine (#1777)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* Update daily-run-test.yml

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update
---
 .../scripts/eval_regression_base_fullbench.py |   2 +
 ...base.py => eval_regression_base_models.py} |  26 +
 ...chat.py => eval_regression_chat_models.py} |  45 +-
 ... => eval_regression_chat_obj_fullbench.py} |   2 +
 ... => eval_regression_chat_sub_fullbench.py} |  29 +-
 .github/scripts/oc_score_assert.py            | 382 ++------
 .github/scripts/oc_score_baseline.yaml        |   4 +-
 .../scripts/oc_score_baseline_fullbench.yaml  | 877 +++++++++--------
 .../scripts/oc_score_baseline_testrange.yaml  | 927 +++++++++---------
 .github/workflows/daily-run-test.yml          | 302 +++---
 .github/workflows/pr-run-test.yml             |  52 +-
 11 files changed, 1226 insertions(+), 1422 deletions(-)
 rename .github/scripts/{eval_regression_base.py => eval_regression_base_models.py} (80%)
 rename .github/scripts/{eval_regression_chat.py => eval_regression_chat_models.py} (76%)
 rename .github/scripts/{eval_regression_chat_objective_fullbench.py => eval_regression_chat_obj_fullbench.py} (99%)
 rename .github/scripts/{eval_regression_chat_subjective_fullbench.py => eval_regression_chat_sub_fullbench.py} (86%)

diff --git a/.github/scripts/eval_regression_base_fullbench.py b/.github/scripts/eval_regression_base_fullbench.py
index 23a8505b..028a41b6 100644
--- a/.github/scripts/eval_regression_base_fullbench.py
+++ b/.github/scripts/eval_regression_base_fullbench.py
@@ -66,6 +66,8 @@ with read_base():
     from opencompass.configs.summarizers.groups.mmlu_pro import \
         mmlu_pro_summary_groups  # noqa: F401, E501
 
+    from ...volc import infer as volc_infer  # noqa: F401, E501
+
 race_datasets = [race_datasets[1]]  # Only take RACE-High
 humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
 bbh_datasets = [
diff --git a/.github/scripts/eval_regression_base.py b/.github/scripts/eval_regression_base_models.py
similarity index 80%
rename from .github/scripts/eval_regression_base.py
rename to .github/scripts/eval_regression_base_models.py
index 330c97e5..08413707 100644
--- a/.github/scripts/eval_regression_base.py
+++ b/.github/scripts/eval_regression_base_models.py
@@ -13,12 +13,22 @@ with read_base():
     # read hf models - chat models
     from opencompass.configs.models.chatglm.hf_glm4_9b import \
         models as hf_glm4_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
+        models as lmdeploy_glm4_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
+        models as hf_deepseek_7b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
+        models as hf_deepseek_67b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
         models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
         models as hf_deepseek_v2_lite_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
         models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
+        models as lmdeploy_deepseek_67b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2 import \
+        lmdeploy_deepseek_v2_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
         models as vllm_deepseek_moe_16b_base_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b import \
@@ -29,6 +39,8 @@ with read_base():
         models as hf_gemma_2b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_7b import \
         models as hf_gemma_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.lmdeploy_gemma_9b import \
+        models as lmdeploy_gemma_9b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_2b import \
         models as vllm_gemma_2b_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_7b import \
@@ -59,10 +71,14 @@ with read_base():
         models as hf_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.hf_llama3_8b import \
         models as hf_llama3_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.hf_llama3_70b import \
+        models as hf_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
         models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
         models as lmdeploy_llama3_8b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \
+        models as lmdeploy_llama3_70b_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
         models as hf_mistral_7b_v0_2_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
@@ -73,10 +89,16 @@ with read_base():
         models as hf_qwen_2_5_7b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
         models as hf_qwen_2_5_14b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_32b import \
+        models as hf_qwen_2_5_32b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
         models as lmdeploy_qwen2_5_1_5b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
         models as lmdeploy_qwen2_5_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \
+        models as lmdeploy_qwen2_5_32b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import \
+        models as lmdeploy_qwen2_5_72b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
         models as hf_qwen1_5_moe_a2_7b_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
@@ -95,6 +117,10 @@ with read_base():
         models as hf_yi_1_5_6b_model  # noqa: F401, E501
     from opencompass.configs.models.yi.hf_yi_1_5_9b import \
         models as hf_yi_1_5_9b_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import \
+        models as lmdeploy_yi_1_5_9b_model  # noqa: F401, E501
+
+    from ...volc import infer as volc_infer  # noqa: F401, E501
 
 race_datasets = [race_datasets[1]]
 models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat_models.py
similarity index 76%
rename from .github/scripts/eval_regression_chat.py
rename to .github/scripts/eval_regression_chat_models.py
index 7762e4f7..eeade13f 100644
--- a/.github/scripts/eval_regression_chat.py
+++ b/.github/scripts/eval_regression_chat_models.py
@@ -7,8 +7,6 @@ with read_base():
     from opencompass.configs.datasets.race.race_gen import \
         race_datasets  # noqa: F401, E501
     # read hf models - chat models
-    from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
-        models as hf_baichuan2_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
         models as hf_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
@@ -17,22 +15,30 @@ with read_base():
         models as vllm_glm4_9b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
         models as hf_deepseek_7b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
+        models as hf_deepseek_67b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
         models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
         models as hf_deepseek_v2_lite_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
+        models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
     from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
         models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
         models as hf_gemma2_2b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
         models as hf_gemma2_9b_it_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.hf_gemma2_27b_it import \
+        models as hf_gemma2_27b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_2b_it import \
         models as hf_gemma_2b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.hf_gemma_7b_it import \
         models as hf_gemma_7b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
         models as lmdeploy_gemma_9b_it_model  # noqa: F401, E501
+    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
+        models as lmdeploy_gemma_27b_it_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
         models as vllm_gemma_7b_it_model  # noqa: F401, E501
     from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
@@ -65,6 +71,8 @@ with read_base():
         models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
         models as lmdeploy_llama3_2_3b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import \
+        models as lmdeploy_llama3_3_70b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
         models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
@@ -75,6 +83,13 @@ with read_base():
         models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
         models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
+        models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
+        models as hf_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
+    from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
+        models as \
+        lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
         models as lmdeploy_mistral_nemo_instruct_2407_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
@@ -84,22 +99,28 @@ with read_base():
         models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
     from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
         models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
+    from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
+        models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
     from opencompass.configs.models.openbmb.hf_minicpm3_4b import \
         models as hf_minicpm3_4b_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
-        models as hf_minicpm_2b_dpo_fp32_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
-        models as hf_minicpm_2b_sft_bf16_model  # noqa: F401, E501
-    from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
-        models as hf_minicpm_2b_sft_fp32_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
         models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
         models as hf_phi_3_mini_8k_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
+        models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
+        models as hf_qwen2_5_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
         models as hf_qwen2_5_14b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import \
+        models as lmdeploy_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import \
+        models as lmdeploy_qwen2_5_3b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
         models as lmdeploy_qwen2_5_14b_instruct_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
+        models as lmdeploy_qwen2_5_72b_instruct_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
         models as hf_qwen1_5_0_5b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
@@ -116,6 +137,14 @@ with read_base():
         models as hf_yi_1_5_6b_chat_model  # noqa: F401, E501
     from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
         models as hf_yi_1_5_9b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import \
+        models as lmdeploy_yi_1_5_6b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
+        models as lmdeploy_yi_1_5_9b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \
+        models as lmdeploy_yi_1_5_34b_chat_model  # noqa: F401, E501
+
+    from ...volc import infer as volc_infer  # noqa: F401, E501
 
 race_datasets = [race_datasets[1]]
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
diff --git a/.github/scripts/eval_regression_chat_objective_fullbench.py b/.github/scripts/eval_regression_chat_obj_fullbench.py
similarity index 99%
rename from .github/scripts/eval_regression_chat_objective_fullbench.py
rename to .github/scripts/eval_regression_chat_obj_fullbench.py
index f02fb7c4..98588dee 100644
--- a/.github/scripts/eval_regression_chat_objective_fullbench.py
+++ b/.github/scripts/eval_regression_chat_obj_fullbench.py
@@ -107,6 +107,8 @@ with read_base():
     from opencompass.configs.summarizers.mmmlu_lite import \
         mmmlu_summary_groups  # noqa: F401, E501
 
+    from ...volc import infer as volc_infer  # noqa: F401, E501
+
 # For HumanEval-X Evaluation
 # Apply the evaluator ip_address and port
 race_datasets = [race_datasets[1]]
diff --git a/.github/scripts/eval_regression_chat_subjective_fullbench.py b/.github/scripts/eval_regression_chat_sub_fullbench.py
similarity index 86%
rename from .github/scripts/eval_regression_chat_subjective_fullbench.py
rename to .github/scripts/eval_regression_chat_sub_fullbench.py
index 60495f22..6ef87752 100644
--- a/.github/scripts/eval_regression_chat_subjective_fullbench.py
+++ b/.github/scripts/eval_regression_chat_sub_fullbench.py
@@ -22,8 +22,7 @@ with read_base():
         arenahard_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
         compassarena_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import \
-        fofo_datasets  # noqa: F401, E501
+    # from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import fofo_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
         followbench_llmeval_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
@@ -35,6 +34,8 @@ with read_base():
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
         models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
 
+    from ...volc import infer as volc_infer  # noqa: F401, E501
+
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
                 and 'mtbench101' not in k and 'wildbench' not in k), [])
 datasets += mtbench101_datasets  # noqa: F401, E501
@@ -73,25 +74,15 @@ eval = dict(
 
 summary_groups = []
 summary_groups.append({
-    'name':
-    'compassarena_language',
+    'name': 'compassarena_language',
     'subsets': [
         ['compassarena_language', '内容总结'],
-        ['compassarena_language', '情感分析'],
-        ['compassarena_language', 'Information Retrival'],
-        ['compassarena_language', '综合问答'],
-        ['compassarena_language', '中华文化'],
     ],
 })
 summary_groups.append({
-    'name':
-    'compassarena_knowledge',
+    'name': 'compassarena_knowledge',
     'subsets': [
         ['compassarena_knowledge', '生活常识_ZH'],
-        ['compassarena_knowledge', '自然科学工科_ZH'],
-        ['compassarena_knowledge', '人文科学_ZH'],
-        ['compassarena_knowledge', '自然科学理科_ZH'],
-        ['compassarena_knowledge', '社会科学_ZH'],
     ],
 })
 summary_groups.append({
@@ -101,21 +92,15 @@ summary_groups.append({
     ],
 })
 summary_groups.append({
-    'name':
-    'compassarena_math_v2',
+    'name': 'compassarena_math_v2',
     'subsets': [
         ['compassarena_math_v2', '高等数学_ZH'],
-        ['compassarena_math_v2', '初等数学_ZH'],
-        ['compassarena_math_v2', '中等数学_ZH'],
     ],
 })
 summary_groups.append({
-    'name':
-    'compassarena_creationv2_zh',
+    'name': 'compassarena_creationv2_zh',
     'subsets': [
         ['compassarena_creationv2_zh', '内容扩写_ZH'],
-        ['compassarena_creationv2_zh', '内容续写_ZH'],
-        ['compassarena_creationv2_zh', '内容改写_ZH'],
     ],
 })
 summary_groups.append({
diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py
index d53c5bf5..6ad6e295 100644
--- a/.github/scripts/oc_score_assert.py
+++ b/.github/scripts/oc_score_assert.py
@@ -6,74 +6,19 @@ import yaml
 
 output_path = 'regression_result_daily'
 
-chat_model_list = [
-    'baichuan2-7b-chat-hf',
-    'glm-4-9b-chat-hf',
-    'glm-4-9b-chat-turbomind',
-    'glm-4-9b-chat-vllm',
-    'deepseek-7b-chat-hf',
-    'deepseek-moe-16b-chat-hf',
-    'deepseek-7b-chat-vllm',
-    'gemma2-2b-it-hf',
-    'gemma2-9b-it-hf',
-    'gemma-2b-it-hf',
-    'gemma-7b-it-hf',
-    'gemma-2-9b-it-turbomind',
-    'gemma-7b-it-vllm',
-    'internlm2_5-7b-chat-hf',
-    'internlm2_5-7b-chat-turbomind',
-    'internlm2-chat-1.8b-turbomind',
-    'internlm2-chat-1.8b-sft-turbomind',
-    'internlm2-chat-7b-lmdeploy',
-    'internlm2-chat-7b-sft-turbomind',
-    'internlm2-chat-7b-vllm',
-    'llama-3_1-8b-instruct-hf',
-    'llama-3_2-3b-instruct-hf',
-    'llama-3-8b-instruct-hf',
-    'llama-3_1-8b-instruct-turbomind',
-    'llama-3_2-3b-instruct-turbomind',
-    'llama-3-8b-instruct-turbomind',
-    'mistral-7b-instruct-v0.2-hf',
-    'mistral-7b-instruct-v0.3-hf',
-    'mistral-nemo-instruct-2407-hf',
-    'mistral-nemo-instruct-2407-turbomind',
-    'mistral-7b-instruct-v0.1-vllm',
-    'mistral-7b-instruct-v0.2-vllm',
-    # 'MiniCPM3-4B-hf', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
-    # 'minicpm-2b-sft-fp32-hf',
-    'phi-3-mini-4k-instruct-hf',
-    'qwen1.5-0.5b-chat-hf',
-    'qwen2-1.5b-instruct-hf',
-    'qwen2-7b-instruct-hf',
-    'qwen2-1.5b-instruct-turbomind',
-    'qwen2-7b-instruct-turbomind',
-    'qwen1.5-0.5b-chat-vllm',
-    'yi-1.5-6b-chat-hf',
-    'yi-1.5-9b-chat-hf',
-    'deepseek-v2-lite-chat-hf',
-    'internlm2_5-20b-chat-hf',
-    'internlm2_5-20b-chat-turbomind',
-    'mistral-small-instruct-2409-hf',
-    'mistral-small-instruct-2409-turbomind',
-    'qwen2.5-14b-instruct-hf',
-    'qwen2.5-14b-instruct-turbomind'
-]
-base_model_list = [
-    'glm-4-9b-hf', 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
-    'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', 'gemma2-9b-hf',
-    'gemma-2b-hf', 'gemma-7b-hf', 'gemma-2b-vllm', 'gemma-7b-vllm',
-    'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
-    'internlm2-1.8b-turbomind', 'internlm2_5-7b-turbomind',
-    'internlm2-7b-turbomind', 'internlm2-base-7b-turbomind', 'llama-2-7b-hf',
-    'llama-3_1-8b-hf', 'llama-3-8b-hf', 'llama-3.1-8b-turbomind',
-    'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'mistral-7b-v0.3-hf',
-    'mistral-7b-v0.2-vllm', 'qwen2.5-7b-hf', 'qwen2.5-1.5b-turbomind',
-    'qwen2.5-7b-turbomind', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
-    'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
-    'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf',
-    'deepseek-v2-lite-hf', 'internlm2-20b-hf', 'internlm2-base-20b-hf',
-    'internlm2-20b-turbomind', 'qwen2.5-14b-hf'
-]
+
+def model_list(type):
+    config_path = '.github/scripts/oc_score_baseline_testrange.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(type).keys()
+
+
+def dataset_list(model, type):
+    config_path = '.github/scripts/oc_score_baseline_fullbench.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(model).get(type).keys()
 
 
 @pytest.fixture()
@@ -115,36 +60,39 @@ def result_scores():
 
 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores_testrange')
-@pytest.mark.chat
+@pytest.mark.chat_models
 class TestChat:
     """Test cases for chat model."""
 
     @pytest.mark.parametrize(
-        'model, dataset', [(p1, p2) for p1 in chat_model_list
+        'model, dataset', [(p1, p2) for p1 in model_list('chat')
                            for p2 in ['gsm8k_accuracy', 'race-high_accuracy']])
     def test_model_dataset_score(self, baseline_scores_testrange,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_testrange.get(model).get(dataset)
+        base_score = baseline_scores_testrange.get('chat').get(model).get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
 
 
 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores_testrange')
-@pytest.mark.base
+@pytest.mark.base_models
 class TestBase:
     """Test cases for base model."""
 
-    @pytest.mark.parametrize('model, dataset', [
-        (p1, p2) for p1 in base_model_list for p2 in
-        ['gsm8k_accuracy', 'GPQA_diamond', 'race-high_accuracy', 'winogrande']
-    ])
+    @pytest.mark.parametrize('model, dataset',
+                             [(p1, p2) for p1 in model_list('base') for p2 in [
+                                 'gsm8k_accuracy', 'GPQA_diamond_accuracy',
+                                 'race-high_accuracy', 'winogrande_accuracy'
+                             ]])
     def test_model_dataset_score(self, baseline_scores_testrange,
                                  result_scores, model, dataset):
         if model in ['gemma-2b-vllm', 'gemma-7b-vllm'
                      ] and dataset != 'gsm8k_accuracy':
             return
-        base_score = baseline_scores_testrange.get(model).get(dataset)
+        base_score = baseline_scores_testrange.get('base').get(model).get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
 
@@ -158,28 +106,11 @@ class TestChatObjFullbench:
     @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
-    ] for p2 in [
-        'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
-        'triviaqa_wiki_1shot_score', 'nq_open_1shot_score',
-        'IFEval_Prompt-level-strict-accuracy', 'drop_accuracy',
-        'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score',
-        'musr_average_naive_average', 'korbench_single_naive_average',
-        'gsm8k_accuracy', 'math_accuracy', 'cmo_fib_accuracy',
-        'aime2024_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4',
-        'sanitized_mbpp_score', 'ds1000_naive_average',
-        'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1',
-        'lcb_test_output_pass@1', 'bbh-logical_deduction_seven_objects_score',
-        'bbh-multistep_arithmetic_two_score', 'mmlu-other_naive_average',
-        'cmmlu-china-specific_naive_average', 'mmlu_pro_math_accuracy',
-        'ds1000_Pandas_accuracy', 'ds1000_Numpy_accuracy',
-        'ds1000_Tensorflow_accuracy', 'ds1000_Scipy_accuracy',
-        'ds1000_Sklearn_accuracy', 'ds1000_Pytorch_accuracy',
-        'ds1000_Matplotlib_accuracy', 'openai_mmmlu_lite_AR-XY_accuracy',
-        'college_naive_average', 'college_knowledge_naive_average'
-    ]])
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'objective')])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
 
@@ -193,32 +124,12 @@ class TestChatSubFullbench:
     @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
         'internlm2_5-7b-chat-hf_fullbench',
         'internlm2_5-7b-chat-turbomind_fullbench'
-    ] for p2 in [
-        'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score',
-        'Followbench_naive_average', 'CompassArena_naive_average',
-        'mtbench101_avg', 'wildbench_average',
-        'simpleqa_accuracy_given_attempted',
-        'chinese_simpleqa_given_attempted_accuracy',
-        'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算',
-        'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理',
-        'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作',
-        'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答',
-        'alpaca_eval_helpful_base', 'compassarena_language_naive_average',
-        'compassarena_knowledge_naive_average',
-        'compassarena_reason_v2_naive_average',
-        'compassarena_math_v2_naive_average',
-        'compassarena_creationv2_zh_naive_average',
-        'fofo_test_prompts_overall', 'followbench_llmeval_en_HSR_AVG',
-        'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1',
-        'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3',
-        'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5',
-        'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2',
-        'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4',
-        'followbench_llmeval_en_SSR_L5', 'simpleqa_f1'
-    ]])
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'subjective')]
+                             )
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
 
@@ -229,25 +140,15 @@ class TestChatSubFullbench:
 class TestBaseFullbench:
     """Test cases for chat model."""
 
-    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
-        'internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench'
-    ] for p2 in [
-        'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
-        'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy',
-        'GPQA_diamond_accuracy', 'hellaswag_accuracy', 'TheoremQA_score',
-        'winogrande_accuracy', 'gsm8k_accuracy',
-        'GaokaoBench_2010-2022_Math_II_MCQs_score',
-        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score',
-        'math_accuracy', 'wikibench-wiki-single_choice_cncircular_perf_4',
-        'sanitized_mbpp_score', 'dingo_en_192_score', 'dingo_zh_170_score',
-        'mmlu-other_accuracy', 'cmmlu-china-specific_accuracy',
-        'mmlu_pro_math_accuracy', 'bbh-logical_deduction_seven_objects_score',
-        'bbh-multistep_arithmetic_two_score', 'college_naive_average',
-        'college_knowledge_naive_average'
-    ]])
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in
+         ['internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench']
+         for p2 in dataset_list('internlm2_5-7b-hf_fullbench', 'objective')])
     def test_model_dataset_score(self, baseline_scores_fullbench,
                                  result_scores, model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model, result_score, base_score)
 
@@ -274,193 +175,64 @@ class TestApibench:
 class TestVolcFullbench:
     """Test cases for chat model."""
 
-    @pytest.mark.parametrize('model, dataset', [(
-        p1, p2
-    ) for p1 in ['internlm2_5-7b-chat-turbomind'] for p2 in [
-        'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
-        'triviaqa_wiki_1shot_score', 'nq_open_1shot_score',
-        'mmmlu_lite_naive_average', 'IFEval_Prompt-level-strict-accuracy',
-        'drop_accuracy', 'bbh_naive_average', 'GPQA_diamond_accuracy',
-        'hellaswag_accuracy', 'TheoremQA_score', 'musr_average_naive_average',
-        'korbench_single_naive_average',
-        'ARC_Prize_Public_Evaluation_accuracy', 'gsm8k_accuracy',
-        'GaokaoBench_weighted_average', 'math_accuracy', 'cmo_fib_accuracy',
-        'aime2024_accuracy', 'Mathbench_naive_average',
-        'wikibench-wiki-single_choice_cncircular_perf_4',
-        'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average',
-        'openai_humaneval_humaneval_pass@1', 'sanitized_mbpp_score',
-        'humanevalx_naive_average', 'ds1000_naive_average',
-        'lcb_code_generation_pass@1', 'lcb_code_execution_pass@1',
-        'lcb_test_output_pass@1', 'bigcodebench_hard_instruct_pass@1',
-        'bigcodebench_hard_complete_pass@1', 'teval_naive_average',
-        'qa_dingo_cn_score', 'mmlu-stem_naive_average',
-        'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average',
-        'mmlu-other_naive_average', 'cmmlu-stem_naive_average',
-        'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average',
-        'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average',
-        'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy',
-        'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy',
-        'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy',
-        'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy',
-        'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy',
-        'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy',
-        'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy',
-        'humanevalx-python_pass@1', 'humanevalx-cpp_pass@1',
-        'humanevalx-go_pass@1', 'humanevalx-java_pass@1',
-        'humanevalx-js_pass@1', 'ds1000_Pandas_accuracy',
-        'ds1000_Numpy_accuracy', 'ds1000_Tensorflow_accuracy',
-        'ds1000_Scipy_accuracy', 'ds1000_Sklearn_accuracy',
-        'ds1000_Pytorch_accuracy', 'ds1000_Matplotlib_accuracy',
-        'openai_mmmlu_lite_AR-XY_accuracy', 'openai_mmmlu_lite_BN-BD_accuracy',
-        'openai_mmmlu_lite_DE-DE_accuracy', 'openai_mmmlu_lite_ES-LA_accuracy',
-        'openai_mmmlu_lite_FR-FR_accuracy', 'openai_mmmlu_lite_HI-IN_accuracy',
-        'openai_mmmlu_lite_ID-ID_accuracy', 'openai_mmmlu_lite_IT-IT_accuracy',
-        'openai_mmmlu_lite_JA-JP_accuracy', 'openai_mmmlu_lite_KO-KR_accuracy',
-        'openai_mmmlu_lite_PT-BR_accuracy', 'openai_mmmlu_lite_SW-KE_accuracy',
-        'openai_mmmlu_lite_YO-NG_accuracy', 'openai_mmmlu_lite_ZH-CN_accuracy',
-        'college_naive_average', 'high_naive_average', 'middle_naive_average',
-        'primary_naive_average', 'arithmetic_naive_average',
-        'mathbench-a (average)_naive_average',
-        'college_knowledge_naive_average', 'high_knowledge_naive_average',
-        'middle_knowledge_naive_average', 'primary_knowledge_naive_average',
-        'mathbench-t (average)_naive_average'
-    ]])
-    @pytest.mark.chat_objective
-    def test_chat_objective(self, baseline_scores_fullbench, result_scores,
-                            model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
-        result_score = result_scores.get(model).get(dataset)
-        assert_score(model + '_batch', result_score, base_score)
-
     @pytest.mark.parametrize(
         'model, dataset',
         [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
-         for p2 in [
-             'alignment_bench_v1_1_总分', 'alpaca_eval_total', 'arenahard_score',
-             'Followbench_naive_average', 'CompassArena_naive_average',
-             'FoFo_naive_average', 'mtbench101_avg', 'wildbench_average',
-             'simpleqa_accuracy_given_attempted',
-             'chinese_simpleqa_given_attempted_accuracy',
-             'alignment_bench_v1_1_专业能力', 'alignment_bench_v1_1_数学计算',
-             'alignment_bench_v1_1_基本任务', 'alignment_bench_v1_1_逻辑推理',
-             'alignment_bench_v1_1_中文理解', 'alignment_bench_v1_1_文本写作',
-             'alignment_bench_v1_1_角色扮演', 'alignment_bench_v1_1_综合问答',
-             'alpaca_eval_helpful_base', 'alpaca_eval_koala',
-             'alpaca_eval_oasst', 'alpaca_eval_selfinstruct',
-             'alpaca_eval_vicuna', 'compassarena_language_naive_average',
-             'compassarena_knowledge_naive_average',
-             'compassarena_reason_v2_naive_average',
-             'compassarena_math_v2_naive_average',
-             'compassarena_creationv2_zh_naive_average',
-             'fofo_test_prompts_overall', 'fofo_test_prompts_cn_overall',
-             'followbench_llmeval_en_HSR_AVG',
-             'followbench_llmeval_en_SSR_AVG', 'followbench_llmeval_en_HSR_L1',
-             'followbench_llmeval_en_HSR_L2', 'followbench_llmeval_en_HSR_L3',
-             'followbench_llmeval_en_HSR_L4', 'followbench_llmeval_en_HSR_L5',
-             'followbench_llmeval_en_SSR_L1', 'followbench_llmeval_en_SSR_L2',
-             'followbench_llmeval_en_SSR_L3', 'followbench_llmeval_en_SSR_L4',
-             'followbench_llmeval_en_SSR_L5', 'simpleqa_f1'
-         ]])
-    @pytest.mark.chat_subjective
-    def test_chat_subjective(self, baseline_scores_fullbench, result_scores,
-                             model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+         for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
+    @pytest.mark.chat_objective
+    def test_chat_objective(self, baseline_scores_fullbench, result_scores,
+                            model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)
 
-    @pytest.mark.parametrize('model, dataset', [(
-        p1, p2
-    ) for p1 in ['internlm2_5-7b-turbomind'] for p2 in [
-        'race-high_accuracy', 'ARC-c_accuracy', 'BoolQ_accuracy',
-        'triviaqa_wiki_1shot_score', 'nq_open_1shot_score', 'drop_accuracy',
-        'bbh_naive_average', 'GPQA_diamond_accuracy', 'hellaswag_accuracy',
-        'TheoremQA_score', 'winogrande_accuracy', 'gsm8k_accuracy',
-        'GaokaoBench_weighted_average', 'math_accuracy',
-        'Mathbench_naive_average',
-        'wikibench-wiki-single_choice_cncircular_perf_4',
-        'cmmlu_naive_average', 'mmlu_naive_average', 'mmlu_pro_naive_average',
-        'openai_humaneval_humaneval_pass@1',
-        'openai_humaneval_v2_humaneval_pass@1', 'sanitized_mbpp_score',
-        'dingo_en_192_score', 'dingo_zh_170_score', 'mmlu-stem_naive_average',
-        'mmlu-social-science_naive_average', 'mmlu-humanities_naive_average',
-        'mmlu-other_naive_average', 'cmmlu-stem_naive_average',
-        'cmmlu-social-science_naive_average', 'cmmlu-humanities_naive_average',
-        'cmmlu-other_naive_average', 'cmmlu-china-specific_naive_average',
-        'mmlu_pro_biology_accuracy', 'mmlu_pro_business_accuracy',
-        'mmlu_pro_chemistry_accuracy', 'mmlu_pro_computer_science_accuracy',
-        'mmlu_pro_economics_accuracy', 'mmlu_pro_engineering_accuracy',
-        'mmlu_pro_health_accuracy', 'mmlu_pro_history_accuracy',
-        'mmlu_pro_law_accuracy', 'mmlu_pro_math_accuracy',
-        'mmlu_pro_philosophy_accuracy', 'mmlu_pro_physics_accuracy',
-        'mmlu_pro_psychology_accuracy', 'mmlu_pro_other_accuracy',
-        'college_naive_average', 'high_naive_average', 'middle_naive_average',
-        'primary_naive_average', 'arithmetic_naive_average',
-        'mathbench-a (average)_naive_average',
-        'college_knowledge_naive_average', 'high_knowledge_naive_average',
-        'middle_knowledge_naive_average', 'primary_knowledge_naive_average',
-        'mathbench-t (average)_naive_average'
-    ]])
-    @pytest.mark.base_objective
-    def test_base_objective(self, baseline_scores_fullbench, result_scores,
-                            model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+    @pytest.mark.parametrize('model, dataset', [
+        (p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
+        for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'subjective')
+    ])
+    @pytest.mark.chat_subjective
+    def test_chat_subjective(self, baseline_scores_fullbench, result_scores,
+                             model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)
 
     @pytest.mark.parametrize(
         'model, dataset',
         [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
-         for p2 in [
-             'Single-Needle-Retrieval(S-RT)-32000_naive_average',
-             'Single-Needle-Retrieval-EN-32000_naive_average',
-             'Single-Needle-Retrieval-ZH-32000_naive_average',
-             'Single-Needle-Retrieval(S-RT)-100000_naive_average',
-             'Single-Needle-Retrieval-EN-100000_naive_average',
-             'Single-Needle-Retrieval-ZH-100000_naive_average',
-             'Single-Needle-Retrieval(S-RT)-200000_naive_average',
-             'Single-Needle-Retrieval-EN-200000_naive_average',
-             'Single-Needle-Retrieval-ZH-200000_naive_average',
-             'longbench_naive_average', 'longbench_zh_naive_average',
-             'longbench_en_naive_average',
-             'longbench_single-document-qa_naive_average',
-             'longbench_multi-document-qa_naive_average',
-             'longbench_summarization_naive_average',
-             'longbench_few-shot-learning_naive_average',
-             'longbench_synthetic-tasks_naive_average',
-             'longbench_code-completion_naive_average'
-         ]])
-    @pytest.mark.base_long_context
-    def test_base_long_context(self, baseline_scores_fullbench, result_scores,
-                               model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'objective')])
+    @pytest.mark.base_objective
+    def test_base_objective(self, baseline_scores_fullbench, result_scores,
+                            model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)
 
     @pytest.mark.parametrize(
         'model, dataset',
-        [(p1, p2) for p1 in ['internlm2_5-7b-chat-1m-turbomind']
-         for p2 in [
-             'ruler_8k_naive_average', 'ruler_32k_naive_average',
-             'ruler_128k_naive_average',
-             'NeedleBench-Overall-Score-8K_weighted_average',
-             'NeedleBench-Overall-Score-32K_weighted_average',
-             'NeedleBench-Overall-Score-128K_weighted_average',
-             'longbench_naive_average', 'longbench_zh_naive_average',
-             'longbench_en_naive_average', 'babilong_0k_naive_average',
-             'babilong_4k_naive_average', 'babilong_16k_naive_average',
-             'babilong_32k_naive_average', 'babilong_128k_naive_average',
-             'babilong_256k_naive_average',
-             'longbench_single-document-qa_naive_average',
-             'longbench_multi-document-qa_naive_average',
-             'longbench_summarization_naive_average',
-             'longbench_few-shot-learning_naive_average',
-             'longbench_synthetic-tasks_naive_average',
-             'longbench_code-completion_naive_average'
-         ]])
+        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'long_context')])
+    @pytest.mark.base_long_context
+    def test_base_long_context(self, baseline_scores_fullbench, result_scores,
+                               model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2)
+         for p1 in ['internlm2_5-7b-chat-1m-turbomind'] for p2 in dataset_list(
+             'internlm2_5-7b-chat-1m-turbomind', 'long_context')])
     @pytest.mark.chat_long_context
     def test_chat_long_context(self, baseline_scores_fullbench, result_scores,
                                model, dataset):
-        base_score = baseline_scores_fullbench.get(model).get(dataset)
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(model + '_batch', result_score, base_score)
 
diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml
index 681ac5d3..a8e40891 100644
--- a/.github/scripts/oc_score_baseline.yaml
+++ b/.github/scripts/oc_score_baseline.yaml
@@ -14,12 +14,12 @@ internlm2-1.8b-hf:
     race-high_accuracy: 66.38
 
 internlm2_5-7b-chat-lmdeploy:
-    demo_gsm8k_accuracy: 84.38
+    demo_gsm8k_accuracy: 89.06
     race-middle_accuracy: 92.76
     race-high_accuracy: 90.54
 
 internlm2-chat-1.8b-lmdeploy:
-    demo_gsm8k_accuracy: 31
+    demo_gsm8k_accuracy: 32
     race-middle_accuracy: 81.34
     race-high_accuracy: 73.96
 
diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml
index 0359b633..568ed5fd 100644
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@@ -1,447 +1,456 @@
 internlm2_5-7b-chat-hf_fullbench:
-    race-high_accuracy: 93.75
-    ARC-c_accuracy: 93.75
-    BoolQ_accuracy: 81.25
-    triviaqa_wiki_1shot_score: 50
-    nq_open_1shot_score: 25
-    IFEval_Prompt-level-strict-accuracy: 50
-    drop_accuracy: 81.25
-    GPQA_diamond_accuracy: 25
-    hellaswag_accuracy: 87.5
-    TheoremQA_score: 18.75
-    musr_average_naive_average: 39.58
-    korbench_single_naive_average: 40
-    gsm8k_accuracy: 62.50
-    math_accuracy: 75
-    cmo_fib_accuracy: 6.25
-    aime2024_accuracy: 6.25
-    wikibench-wiki-single_choice_cncircular_perf_4: 50
-    sanitized_mbpp_score: 68.75
-    ds1000_naive_average: 16.96
-    lcb_code_generation_pass@1: 12.5
-    lcb_code_execution_pass@1: 43.75
-    lcb_test_output_pass@1: 18.75
-    bbh-logical_deduction_seven_objects_score: 50
-    bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 72.6
-    cmmlu-china-specific_naive_average: 76.25
-    mmlu_pro_math_accuracy: 25
-    ds1000_Pandas_accuracy: 12.5
-    ds1000_Numpy_accuracy: 0
-    ds1000_Tensorflow_accuracy: 12.5
-    ds1000_Scipy_accuracy: 18.75
-    ds1000_Sklearn_accuracy: 18.75
-    ds1000_Pytorch_accuracy: 12.5
-    ds1000_Matplotlib_accuracy: 43.75
-    openai_mmmlu_lite_AR-XY_accuracy: 37.5
-    college_naive_average: 12.5
-    college_knowledge_naive_average: 87.5
-    alignment_bench_v1_1_总分: 0.66
-    alpaca_eval_total: 0
-    arenahard_score: 50
-    Followbench_naive_average: 1
-    CompassArena_naive_average: 54.48
-    mtbench101_avg: 8.1
-    wildbench_average: -9.86
-    simpleqa_accuracy_given_attempted: 0
-    chinese_simpleqa_given_attempted_accuracy: 1
-    alignment_bench_v1_1_专业能力: 8
-    alignment_bench_v1_1_数学计算: 0
-    alignment_bench_v1_1_基本任务: 0
-    alignment_bench_v1_1_逻辑推理: 0
-    alignment_bench_v1_1_中文理解: 0
-    alignment_bench_v1_1_文本写作: 0
-    alignment_bench_v1_1_角色扮演: 0
-    alignment_bench_v1_1_综合问答: 0
-    alpaca_eval_helpful_base: 0
-    compassarena_language_naive_average: 62
-    compassarena_knowledge_naive_average: 56
-    compassarena_reason_v2_naive_average: 49
-    compassarena_math_v2_naive_average: 57.05
-    compassarena_creationv2_zh_naive_average: 48.34
-    fofo_test_prompts_overall: 1
-    followbench_llmeval_en_HSR_AVG: 1
-    followbench_llmeval_en_SSR_AVG: 1
-    followbench_llmeval_en_HSR_L1: 1
-    followbench_llmeval_en_HSR_L2: 1
-    followbench_llmeval_en_HSR_L3: 1
-    followbench_llmeval_en_HSR_L4: 1
-    followbench_llmeval_en_HSR_L5: 1
-    followbench_llmeval_en_SSR_L1: 1
-    followbench_llmeval_en_SSR_L2: 1
-    followbench_llmeval_en_SSR_L3: 1
-    followbench_llmeval_en_SSR_L4: 1
-    followbench_llmeval_en_SSR_L5: 1
-    simpleqa_f1: 0
+    objective:
+        race-high_accuracy: 93.75
+        ARC-c_accuracy: 93.75
+        BoolQ_accuracy: 81.25
+        triviaqa_wiki_1shot_score: 50
+        nq_open_1shot_score: 25
+        IFEval_Prompt-level-strict-accuracy: 50
+        drop_accuracy: 81.25
+        GPQA_diamond_accuracy: 25
+        hellaswag_accuracy: 87.5
+        TheoremQA_score: 18.75
+        musr_average_naive_average: 39.58
+        korbench_single_naive_average: 40
+        gsm8k_accuracy: 62.50
+        math_accuracy: 75
+        cmo_fib_accuracy: 6.25
+        aime2024_accuracy: 6.25
+        wikibench-wiki-single_choice_cncircular_perf_4: 50
+        sanitized_mbpp_score: 68.75
+        ds1000_naive_average: 16.96
+        lcb_code_generation_pass@1: 12.5
+        lcb_code_execution_pass@1: 43.75
+        lcb_test_output_pass@1: 18.75
+        bbh-logical_deduction_seven_objects_score: 50
+        bbh-multistep_arithmetic_two_score: 68.75
+        mmlu-other_naive_average: 72.6
+        cmmlu-china-specific_naive_average: 76.25
+        mmlu_pro_math_accuracy: 25
+        ds1000_Pandas_accuracy: 12.5
+        ds1000_Numpy_accuracy: 0
+        ds1000_Tensorflow_accuracy: 12.5
+        ds1000_Scipy_accuracy: 18.75
+        ds1000_Sklearn_accuracy: 18.75
+        ds1000_Pytorch_accuracy: 12.5
+        ds1000_Matplotlib_accuracy: 43.75
+        openai_mmmlu_lite_AR-XY_accuracy: 37.5
+        college_naive_average: 12.5
+        college_knowledge_naive_average: 87.5
+    subjective:
+        alignment_bench_v1_1_总分: 0.66
+        alpaca_eval_total: 20
+        arenahard_score: 50
+        Followbench_naive_average: 1
+        CompassArena_naive_average: 44.00
+        mtbench101_avg: 7.8
+        wildbench_average: -12.78
+        simpleqa_accuracy_given_attempted: 0
+        chinese_simpleqa_given_attempted_accuracy: 1
+        alignment_bench_v1_1_专业能力: 7.90
+        alignment_bench_v1_1_数学计算: 0
+        alignment_bench_v1_1_基本任务: 0
+        alignment_bench_v1_1_逻辑推理: 0
+        alignment_bench_v1_1_中文理解: 0
+        alignment_bench_v1_1_文本写作: 0
+        alignment_bench_v1_1_角色扮演: 0
+        alignment_bench_v1_1_综合问答: 0
+        alpaca_eval_helpful_base: 20
+        compassarena_language_naive_average: 35
+        compassarena_knowledge_naive_average: 55
+        compassarena_reason_v2_naive_average: 45.00
+        compassarena_math_v2_naive_average: 55
+        compassarena_creationv2_zh_naive_average: 30
+        followbench_llmeval_en_HSR_AVG: 1
+        followbench_llmeval_en_SSR_AVG: 1
+        followbench_llmeval_en_HSR_L1: 1
+        followbench_llmeval_en_HSR_L2: 1
+        followbench_llmeval_en_HSR_L3: 1
+        followbench_llmeval_en_HSR_L4: 1
+        followbench_llmeval_en_HSR_L5: 1
+        followbench_llmeval_en_SSR_L1: 1
+        followbench_llmeval_en_SSR_L2: 1
+        followbench_llmeval_en_SSR_L3: 1
+        followbench_llmeval_en_SSR_L4: 1
+        followbench_llmeval_en_SSR_L5: 1
+        simpleqa_f1: 0
 
 internlm2_5-7b-chat-turbomind_fullbench:
-    race-high_accuracy:  93.75
-    ARC-c_accuracy: 87.5
-    BoolQ_accuracy: 68.75
-    triviaqa_wiki_1shot_score: 50
-    nq_open_1shot_score: 25
-    IFEval_Prompt-level-strict-accuracy: 50
-    drop_accuracy: 75
-    GPQA_diamond_accuracy: 25
-    hellaswag_accuracy: 81.25
-    TheoremQA_score: 6.25
-    musr_average_naive_average: 37.5
-    korbench_single_naive_average: 41.25
-    gsm8k_accuracy: 68.75
-    math_accuracy: 75
-    cmo_fib_accuracy: 6.25
-    aime2024_accuracy: 6.25
-    wikibench-wiki-single_choice_cncircular_perf_4: 25
-    sanitized_mbpp_score: 68.75
-    ds1000_naive_average: 13.39
-    lcb_code_generation_pass@1: 12.5
-    lcb_code_execution_pass@1: 43.75
-    lcb_test_output_pass@1: 12.5
-    bbh-logical_deduction_seven_objects_score: 56.25
-    bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 74.04
-    cmmlu-china-specific_naive_average: 76.25
-    mmlu_pro_math_accuracy: 25
-    ds1000_Pandas_accuracy: 0
-    ds1000_Numpy_accuracy: 0
-    ds1000_Tensorflow_accuracy: 12.5
-    ds1000_Scipy_accuracy: 18.75
-    ds1000_Sklearn_accuracy: 18.75
-    ds1000_Pytorch_accuracy: 6.25
-    ds1000_Matplotlib_accuracy: 37.5
-    openai_mmmlu_lite_AR-XY_accuracy: 37.5
-    college_naive_average: 0
-    college_knowledge_naive_average: 87.5
-    alignment_bench_v1_1_总分: 0.68
-    alpaca_eval_total: 10
-    arenahard_score: 50
-    Followbench_naive_average: 1
-    CompassArena_naive_average: 52.95
-    mtbench101_avg: 8.1
-    wildbench_average: -4.44
-    simpleqa_accuracy_given_attempted: 0
-    chinese_simpleqa_given_attempted_accuracy: 1
-    alignment_bench_v1_1_专业能力: 8.2
-    alignment_bench_v1_1_数学计算: 0
-    alignment_bench_v1_1_基本任务: 0
-    alignment_bench_v1_1_逻辑推理: 0
-    alignment_bench_v1_1_中文理解: 0
-    alignment_bench_v1_1_文本写作: 0
-    alignment_bench_v1_1_角色扮演: 0
-    alignment_bench_v1_1_综合问答: 0
-    alpaca_eval_helpful_base: 10
-    compassarena_language_naive_average: 61.5
-    compassarena_knowledge_naive_average: 56.5
-    compassarena_reason_v2_naive_average: 47.5
-    compassarena_math_v2_naive_average: 53.03
-    compassarena_creationv2_zh_naive_average: 46.22
-    fofo_test_prompts_overall: 1
-    followbench_llmeval_en_HSR_AVG: 1
-    followbench_llmeval_en_SSR_AVG: 1
-    followbench_llmeval_en_HSR_L1: 1
-    followbench_llmeval_en_HSR_L2: 1
-    followbench_llmeval_en_HSR_L3: 1
-    followbench_llmeval_en_HSR_L4: 1
-    followbench_llmeval_en_HSR_L5: 1
-    followbench_llmeval_en_SSR_L1: 1
-    followbench_llmeval_en_SSR_L2: 1
-    followbench_llmeval_en_SSR_L3: 1
-    followbench_llmeval_en_SSR_L4: 1
-    followbench_llmeval_en_SSR_L5: 1
-    simpleqa_f1: 0
+    objective:
+        race-high_accuracy:  93.75
+        ARC-c_accuracy: 93.75
+        BoolQ_accuracy: 68.75
+        triviaqa_wiki_1shot_score: 50
+        nq_open_1shot_score: 25
+        IFEval_Prompt-level-strict-accuracy: 56.25
+        drop_accuracy: 81.25
+        GPQA_diamond_accuracy: 31.25
+        hellaswag_accuracy: 81.25
+        TheoremQA_score: 6.25
+        musr_average_naive_average: 39.58
+        korbench_single_naive_average: 37.50
+        gsm8k_accuracy: 68.75
+        math_accuracy: 68.75
+        cmo_fib_accuracy: 6.25
+        aime2024_accuracy: 6.25
+        wikibench-wiki-single_choice_cncircular_perf_4: 50.00
+        sanitized_mbpp_score: 68.75
+        ds1000_naive_average: 16.96
+        lcb_code_generation_pass@1: 12.5
+        lcb_code_execution_pass@1: 43.75
+        lcb_test_output_pass@1: 25.00
+        bbh-logical_deduction_seven_objects_score: 50.00
+        bbh-multistep_arithmetic_two_score: 68.75
+        mmlu-other_naive_average: 69.71
+        cmmlu-china-specific_naive_average: 75.83
+        mmlu_pro_math_accuracy: 31.25
+        ds1000_Pandas_accuracy: 0
+        ds1000_Numpy_accuracy: 0
+        ds1000_Tensorflow_accuracy: 12.5
+        ds1000_Scipy_accuracy: 18.75
+        ds1000_Sklearn_accuracy: 18.75
+        ds1000_Pytorch_accuracy: 18.75
+        ds1000_Matplotlib_accuracy: 50.00
+        openai_mmmlu_lite_AR-XY_accuracy: 37.5
+        college_naive_average: 12.50
+        college_knowledge_naive_average: 87.5
+    subjective:
+        alignment_bench_v1_1_总分: 0.70
+        alpaca_eval_total: 0
+        arenahard_score: 50
+        Followbench_naive_average: 1
+        CompassArena_naive_average: 38
+        mtbench101_avg: 7.80
+        wildbench_average: -4.86
+        simpleqa_accuracy_given_attempted: 0
+        chinese_simpleqa_given_attempted_accuracy: 1
+        alignment_bench_v1_1_专业能力: 8.4
+        alignment_bench_v1_1_数学计算: 0
+        alignment_bench_v1_1_基本任务: 0
+        alignment_bench_v1_1_逻辑推理: 0
+        alignment_bench_v1_1_中文理解: 0
+        alignment_bench_v1_1_文本写作: 0
+        alignment_bench_v1_1_角色扮演: 0
+        alignment_bench_v1_1_综合问答: 0
+        alpaca_eval_helpful_base: 0
+        compassarena_language_naive_average: 35
+        compassarena_knowledge_naive_average: 50
+        compassarena_reason_v2_naive_average: 30
+        compassarena_math_v2_naive_average: 50
+        compassarena_creationv2_zh_naive_average: 25
+        followbench_llmeval_en_HSR_AVG: 1
+        followbench_llmeval_en_SSR_AVG: 1
+        followbench_llmeval_en_HSR_L1: 1
+        followbench_llmeval_en_HSR_L2: 1
+        followbench_llmeval_en_HSR_L3: 1
+        followbench_llmeval_en_HSR_L4: 1
+        followbench_llmeval_en_HSR_L5: 1
+        followbench_llmeval_en_SSR_L1: 1
+        followbench_llmeval_en_SSR_L2: 1
+        followbench_llmeval_en_SSR_L3: 1
+        followbench_llmeval_en_SSR_L4: 1
+        followbench_llmeval_en_SSR_L5: 1
+        simpleqa_f1: 0
 
 internlm2_5-7b-hf_fullbench:
-    race-high_accuracy: 100
-    ARC-c_accuracy: 68.75
-    BoolQ_accuracy: 87.5
-    triviaqa_wiki_1shot_score: 43.75
-    nq_open_1shot_score: 43.75
-    drop_accuracy: 62.5
-    GPQA_diamond_accuracy: 62.5
-    hellaswag_accuracy: 93.75
-    TheoremQA_score: 25
-    winogrande_accuracy: 75
-    gsm8k_accuracy: 37.5
-    GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
-    GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
-    math_accuracy: 12.5
-    wikibench-wiki-single_choice_cncircular_perf_4: 25
-    sanitized_mbpp_score: 56.25
-    dingo_en_192_score: 37.5
-    dingo_zh_170_score: 100
-    mmlu-other_accuracy: 76.92
-    cmmlu-china-specific_accuracy: 84.17
-    mmlu_pro_math_accuracy: 18.75
-    bbh-logical_deduction_seven_objects_score: 43.75
-    bbh-multistep_arithmetic_two_score: 56.25
-    college_naive_average: 12.5
-    college_knowledge_naive_average: 87.5
+    objective:
+        race-high_accuracy: 100
+        ARC-c_accuracy: 68.75
+        BoolQ_accuracy: 87.5
+        triviaqa_wiki_1shot_score: 43.75
+        nq_open_1shot_score: 43.75
+        drop_accuracy: 62.5
+        GPQA_diamond_accuracy: 62.5
+        hellaswag_accuracy: 93.75
+        TheoremQA_score: 25
+        winogrande_accuracy: 75
+        gsm8k_accuracy: 37.5
+        GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
+        GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
+        math_accuracy: 12.5
+        wikibench-wiki-single_choice_cncircular_perf_4: 25
+        sanitized_mbpp_score: 56.25
+        dingo_en_192_score: 37.5
+        dingo_zh_170_score: 100
+        mmlu-other_accuracy: 76.92
+        cmmlu-china-specific_accuracy: 84.17
+        mmlu_pro_math_accuracy: 18.75
+        bbh-logical_deduction_seven_objects_score: 43.75
+        bbh-multistep_arithmetic_two_score: 56.25
+        college_naive_average: 12.5
+        college_knowledge_naive_average: 87.5
 
 internlm2_5-7b-turbomind_fullbench:
-    race-high_accuracy: 100
-    ARC-c_accuracy: 68.75
-    BoolQ_accuracy: 87.5
-    triviaqa_wiki_1shot_score: 43.75
-    nq_open_1shot_score: 43.75
-    drop_accuracy: 62.5
-    GPQA_diamond_accuracy: 62.5
-    hellaswag_accuracy: 93.75
-    TheoremQA_score: 31.25
-    winogrande_accuracy: 87.5
-    gsm8k_accuracy: 68.75
-    GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
-    GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
-    math_accuracy: 18.75
-    wikibench-wiki-single_choice_cncircular_perf_4: 25
-    sanitized_mbpp_score: 56.25
-    dingo_en_192_score: 43.75
-    dingo_zh_170_score: 100
-    mmlu-other_accuracy: 76.92
-    cmmlu-china-specific_accuracy: 84.17
-    mmlu_pro_math_accuracy: 18.75
-    bbh-logical_deduction_seven_objects_score: 50
-    bbh-multistep_arithmetic_two_score: 56.25
-    college_naive_average: 12.5
-    college_knowledge_naive_average: 87.5
+    objective:
+        race-high_accuracy: 100
+        ARC-c_accuracy: 68.75
+        BoolQ_accuracy: 87.5
+        triviaqa_wiki_1shot_score: 43.75
+        nq_open_1shot_score: 43.75
+        drop_accuracy: 62.5
+        GPQA_diamond_accuracy: 62.5
+        hellaswag_accuracy: 93.75
+        TheoremQA_score: 25.00
+        winogrande_accuracy: 87.5
+        gsm8k_accuracy: 62.50
+        GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25
+        GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
+        math_accuracy: 18.75
+        wikibench-wiki-single_choice_cncircular_perf_4: 25
+        sanitized_mbpp_score: 62.50
+        dingo_en_192_score: 31.25
+        dingo_zh_170_score: 93.75
+        mmlu-other_accuracy: 76.92
+        cmmlu-china-specific_accuracy: 84.17
+        mmlu_pro_math_accuracy: 18.75
+        bbh-logical_deduction_seven_objects_score: 50
+        bbh-multistep_arithmetic_two_score: 56.25
+        college_naive_average: 12.5
+        college_knowledge_naive_average: 87.5
 
 internlm2_5-7b-turbomind:
-    race-high_accuracy: 89.28
-    ARC-c_accuracy: 52.2
-    BoolQ_accuracy: 89.72
-    triviaqa_wiki_1shot_score: 65.88
-    nq_open_1shot_score: 34.82
-    drop_accuracy: 68.1
-    bbh_naive_average: 72.15
-    GPQA_diamond_accuracy: 32.83
-    hellaswag_accuracy: 88.36
-    TheoremQA_score: 25
-    winogrande_accuracy: 81.29
-    gsm8k_accuracy: 74.68
-    GaokaoBench_weighted_average: 58.19
-    math_accuracy: 33.98
-    Mathbench_naive_average: 48.38
-    wikibench-wiki-single_choice_cncircular_perf_4: 29.1
-    cmmlu_naive_average: 78.94
-    mmlu_naive_average: 71.44
-    mmlu_pro_naive_average: 38.18
-    openai_humaneval_humaneval_pass@1: 59.76
-    openai_humaneval_v2_humaneval_pass@1: 51.22
-    sanitized_mbpp_score: 55.25
-    dingo_en_192_score: 60.94
-    dingo_zh_170_score: 67.65
-    mmlu-stem_naive_average: 63.72
-    mmlu-social-science_naive_average: 80.15
-    mmlu-humanities_naive_average: 74.27
-    mmlu-other_naive_average: 71.85
-    cmmlu-stem_naive_average: 67.07
-    cmmlu-social-science_naive_average: 81.49
-    cmmlu-humanities_naive_average: 85.84
-    cmmlu-other_naive_average: 82.69
-    cmmlu-china-specific_naive_average: 79.88
-    mmlu_pro_biology_accuracy: 58.58
-    mmlu_pro_business_accuracy: 28.01
-    mmlu_pro_chemistry_accuracy: 22.79
-    mmlu_pro_computer_science_accuracy: 39.02
-    mmlu_pro_economics_accuracy: 53.08
-    mmlu_pro_engineering_accuracy: 25.7
-    mmlu_pro_health_accuracy: 46.94
-    mmlu_pro_history_accuracy: 43.04
-    mmlu_pro_law_accuracy: 29.7
-    mmlu_pro_math_accuracy: 24.2
-    mmlu_pro_philosophy_accuracy: 42.48
-    mmlu_pro_physics_accuracy: 26.02
-    mmlu_pro_psychology_accuracy: 52.76
-    mmlu_pro_other_accuracy: 42.21
-    college_naive_average: 10.67
-    high_naive_average: 6.67
-    middle_naive_average: 26.67
-    primary_naive_average: 60
-    arithmetic_naive_average: 55
-    mathbench-a (average)_naive_average: 31.8
-    college_knowledge_naive_average: 62.34
-    high_knowledge_naive_average: 59.83
-    middle_knowledge_naive_average: 71.15
-    primary_knowledge_naive_average: 66.55
-    mathbench-t (average)_naive_average: 64.97
-    Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
-    Single-Needle-Retrieval-EN-32000_naive_average: 100
-    Single-Needle-Retrieval-ZH-32000_naive_average: 100
-    Single-Needle-Retrieval(S-RT)-100000_naive_average: 100
-    Single-Needle-Retrieval-EN-100000_naive_average: 100
-    Single-Needle-Retrieval-ZH-100000_naive_average: 100
-    Single-Needle-Retrieval(S-RT)-200000_naive_average: 100
-    Single-Needle-Retrieval-EN-200000_naive_average: 100
-    Single-Needle-Retrieval-ZH-200000_naive_average: 100
-    longbench_naive_average: 46.19
-    longbench_zh_naive_average: 49.3
-    longbench_en_naive_average: 43.97
-    longbench_single-document-qa_naive_average: 42.84
-    longbench_multi-document-qa_naive_average: 37.29
-    longbench_summarization_naive_average: 23.21
-    longbench_few-shot-learning_naive_average: 61.67
-    longbench_synthetic-tasks_naive_average: 60.05
-    longbench_code-completion_naive_average: 52.09
+    objective:
+        race-high_accuracy: 89.28
+        ARC-c_accuracy: 52.2
+        BoolQ_accuracy: 89.72
+        triviaqa_wiki_1shot_score: 65.88
+        nq_open_1shot_score: 34.82
+        drop_accuracy: 68.1
+        bbh_naive_average: 72.15
+        GPQA_diamond_accuracy: 32.83
+        hellaswag_accuracy: 88.36
+        TheoremQA_score: 25
+        winogrande_accuracy: 81.29
+        gsm8k_accuracy: 74.68
+        GaokaoBench_weighted_average: 58.19
+        math_accuracy: 33.98
+        Mathbench_naive_average: 48.38
+        wikibench-wiki-single_choice_cncircular_perf_4: 29.1
+        cmmlu_naive_average: 78.94
+        mmlu_naive_average: 71.44
+        mmlu_pro_naive_average: 38.18
+        openai_humaneval_humaneval_pass@1: 59.76
+        openai_humaneval_v2_humaneval_pass@1: 51.22
+        sanitized_mbpp_score: 55.25
+        dingo_en_192_score: 60.94
+        dingo_zh_170_score: 67.65
+        mmlu-stem_naive_average: 63.72
+        mmlu-social-science_naive_average: 80.15
+        mmlu-humanities_naive_average: 74.27
+        mmlu-other_naive_average: 71.85
+        cmmlu-stem_naive_average: 67.07
+        cmmlu-social-science_naive_average: 81.49
+        cmmlu-humanities_naive_average: 85.84
+        cmmlu-other_naive_average: 82.69
+        cmmlu-china-specific_naive_average: 79.88
+        mmlu_pro_biology_accuracy: 58.58
+        mmlu_pro_business_accuracy: 28.01
+        mmlu_pro_chemistry_accuracy: 22.79
+        mmlu_pro_computer_science_accuracy: 39.02
+        mmlu_pro_economics_accuracy: 53.08
+        mmlu_pro_engineering_accuracy: 25.7
+        mmlu_pro_health_accuracy: 46.94
+        mmlu_pro_history_accuracy: 43.04
+        mmlu_pro_law_accuracy: 29.7
+        mmlu_pro_math_accuracy: 24.2
+        mmlu_pro_philosophy_accuracy: 42.48
+        mmlu_pro_physics_accuracy: 26.02
+        mmlu_pro_psychology_accuracy: 52.76
+        mmlu_pro_other_accuracy: 42.21
+        college_naive_average: 10.67
+        high_naive_average: 6.67
+        middle_naive_average: 26.67
+        primary_naive_average: 60
+        arithmetic_naive_average: 55
+        mathbench-a (average)_naive_average: 31.8
+        college_knowledge_naive_average: 62.34
+        high_knowledge_naive_average: 59.83
+        middle_knowledge_naive_average: 71.15
+        primary_knowledge_naive_average: 66.55
+        mathbench-t (average)_naive_average: 64.97
+    long_context:
+        Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
+        Single-Needle-Retrieval-EN-32000_naive_average: 100
+        Single-Needle-Retrieval-ZH-32000_naive_average: 100
+        Single-Needle-Retrieval(S-RT)-100000_naive_average: 100
+        Single-Needle-Retrieval-EN-100000_naive_average: 100
+        Single-Needle-Retrieval-ZH-100000_naive_average: 100
+        Single-Needle-Retrieval(S-RT)-200000_naive_average: 100
+        Single-Needle-Retrieval-EN-200000_naive_average: 100
+        Single-Needle-Retrieval-ZH-200000_naive_average: 100
+        longbench_naive_average: 46.19
+        longbench_zh_naive_average: 49.3
+        longbench_en_naive_average: 43.97
+        longbench_single-document-qa_naive_average: 42.84
+        longbench_multi-document-qa_naive_average: 37.29
+        longbench_summarization_naive_average: 23.21
+        longbench_few-shot-learning_naive_average: 61.67
+        longbench_synthetic-tasks_naive_average: 60.05
+        longbench_code-completion_naive_average: 52.09
 
 internlm2_5-7b-chat-turbomind:
-    race-high_accuracy: 86.16
-    ARC-c_accuracy: 90.17
-    BoolQ_accuracy: 87.89
-    triviaqa_wiki_1shot_score: 64.91
-    nq_open_1shot_score: 22.69
-    mmmlu_lite_naive_average: 44.96
-    IFEval_Prompt-level-strict-accuracy: 58.04
-    drop_accuracy: 77.68
-    bbh_naive_average: 73.14
-    GPQA_diamond_accuracy: 25.76
-    hellaswag_accuracy: 94.79
-    TheoremQA_score: 21.5
-    musr_average_naive_average: 51.03
-    korbench_single_naive_average: 31.92
-    ARC_Prize_Public_Evaluation_accuracy: 0.01
-    gsm8k_accuracy: 86.73
-    GaokaoBench_weighted_average: 77.89
-    math_accuracy: 61.5
-    cmo_fib_accuracy: 12.5
-    aime2024_accuracy: 3.33
-    Mathbench_naive_average: 65.17
-    wikibench-wiki-single_choice_cncircular_perf_4: 31.55
-    cmmlu_naive_average: 74.14
-    mmlu_naive_average: 70.52
-    mmlu_pro_naive_average: 44.98
-    openai_humaneval_humaneval_pass@1: 70.73
-    sanitized_mbpp_score: 63.81
-    humanevalx_naive_average: 38.17
-    ds1000_naive_average: 14.15
-    lcb_code_generation_pass@1: 17.75
-    lcb_code_execution_pass@1: 32.57
-    lcb_test_output_pass@1: 24.89
-    bigcodebench_hard_instruct_pass@1: 0.08
-    bigcodebench_hard_complete_pass@1: 0.06
-    teval_naive_average: 80.03
-    qa_dingo_cn_score: 99.01
-    mmlu-stem_naive_average: 68.2
-    mmlu-social-science_naive_average: 76.11
-    mmlu-humanities_naive_average: 68.71
-    mmlu-other_naive_average: 70.56
-    cmmlu-stem_naive_average: 66.27
-    cmmlu-social-science_naive_average: 75.7
-    cmmlu-humanities_naive_average: 77.7
-    cmmlu-other_naive_average: 77.71
-    cmmlu-china-specific_naive_average: 72.94
-    mmlu_pro_biology_accuracy: 66.25
-    mmlu_pro_business_accuracy: 48.42
-    mmlu_pro_chemistry_accuracy: 35.25
-    mmlu_pro_computer_science_accuracy: 47.56
-    mmlu_pro_economics_accuracy: 55.92
-    mmlu_pro_engineering_accuracy: 30.44
-    mmlu_pro_health_accuracy: 45.97
-    mmlu_pro_history_accuracy: 41.21
-    mmlu_pro_law_accuracy: 25.79
-    mmlu_pro_math_accuracy: 54.03
-    mmlu_pro_philosophy_accuracy: 36.47
-    mmlu_pro_physics_accuracy: 37.41
-    mmlu_pro_psychology_accuracy: 58.77
-    mmlu_pro_other_accuracy: 46.21
-    humanevalx-python_pass@1: 53.66
-    humanevalx-cpp_pass@1: 24.39
-    humanevalx-go_pass@1: 0
-    humanevalx-java_pass@1: 57.93
-    humanevalx-js_pass@1: 54.88
-    ds1000_Pandas_accuracy: 12.03
-    ds1000_Numpy_accuracy: 4.09
-    ds1000_Tensorflow_accuracy: 11.11
-    ds1000_Scipy_accuracy: 8.49
-    ds1000_Sklearn_accuracy: 6.96
-    ds1000_Pytorch_accuracy: 7.35
-    ds1000_Matplotlib_accuracy: 49.03
-    openai_mmmlu_lite_AR-XY_accuracy: 17.89
-    openai_mmmlu_lite_BN-BD_accuracy: 27.58
-    openai_mmmlu_lite_DE-DE_accuracy: 51.16
-    openai_mmmlu_lite_ES-LA_accuracy: 56.84
-    openai_mmmlu_lite_FR-FR_accuracy: 57.96
-    openai_mmmlu_lite_HI-IN_accuracy: 33.68
-    openai_mmmlu_lite_ID-ID_accuracy: 51.02
-    openai_mmmlu_lite_IT-IT_accuracy: 50.46
-    openai_mmmlu_lite_JA-JP_accuracy: 50.53
-    openai_mmmlu_lite_KO-KR_accuracy: 45.05
-    openai_mmmlu_lite_PT-BR_accuracy: 57.68
-    openai_mmmlu_lite_SW-KE_accuracy: 32.77
-    openai_mmmlu_lite_YO-NG_accuracy: 31.79
-    openai_mmmlu_lite_ZH-CN_accuracy: 65.05
-    college_naive_average: 20.33
-    high_naive_average: 47.67
-    middle_naive_average: 62
-    primary_naive_average: 72
-    arithmetic_naive_average: 62.33
-    mathbench-a (average)_naive_average: 52.87
-    college_knowledge_naive_average: 70.57
-    high_knowledge_naive_average: 70.13
-    middle_knowledge_naive_average: 81.17
-    primary_knowledge_naive_average: 88.01
-    mathbench-t (average)_naive_average: 77.47
-    alignment_bench_v1_1_总分: 5.68
-    alpaca_eval_total: 25.96
-    arenahard_score: 17.15
-    Followbench_naive_average: 0.81
-    CompassArena_naive_average: 34.61
-    FoFo_naive_average: 0.38
-    mtbench101_avg: 8.01
-    wildbench_average: -15.69
-    simpleqa_accuracy_given_attempted: 0.04
-    chinese_simpleqa_given_attempted_accuracy: 0.34
-    alignment_bench_v1_1_专业能力: 6.05
-    alignment_bench_v1_1_数学计算: 5.87
-    alignment_bench_v1_1_基本任务: 6.01
-    alignment_bench_v1_1_逻辑推理: 4.48
-    alignment_bench_v1_1_中文理解: 6.17
-    alignment_bench_v1_1_文本写作: 6.06
-    alignment_bench_v1_1_角色扮演: 6.3
-    alignment_bench_v1_1_综合问答: 6.45
-    alpaca_eval_helpful_base: 17.83
-    alpaca_eval_koala: 28.21
-    alpaca_eval_oasst: 23.4
-    alpaca_eval_selfinstruct: 30.95
-    alpaca_eval_vicuna: 25
-    compassarena_language_naive_average: 52.5
-    compassarena_knowledge_naive_average: 36
-    compassarena_reason_v2_naive_average: 35
-    compassarena_math_v2_naive_average: 19.91
-    compassarena_creationv2_zh_naive_average: 29.64
-    fofo_test_prompts_overall: 0.35
-    fofo_test_prompts_cn_overall: 0.41
-    followbench_llmeval_en_HSR_AVG: 0.73
-    followbench_llmeval_en_SSR_AVG: 0.88
-    followbench_llmeval_en_HSR_L1: 0.94
-    followbench_llmeval_en_HSR_L2: 0.77
-    followbench_llmeval_en_HSR_L3: 0.73
-    followbench_llmeval_en_HSR_L4: 0.68
-    followbench_llmeval_en_HSR_L5: 0.54
-    followbench_llmeval_en_SSR_L1: 0.94
-    followbench_llmeval_en_SSR_L2: 0.88
-    followbench_llmeval_en_SSR_L3: 0.87
-    followbench_llmeval_en_SSR_L4: 0.87
-    followbench_llmeval_en_SSR_L5: 0.85
-    simpleqa_f1: 0.04
+    objective:
+        race-high_accuracy: 86.16
+        ARC-c_accuracy: 90.17
+        BoolQ_accuracy: 87.89
+        triviaqa_wiki_1shot_score: 64.91
+        nq_open_1shot_score: 22.69
+        mmmlu_lite_naive_average: 44.96
+        IFEval_Prompt-level-strict-accuracy: 58.04
+        drop_accuracy: 77.68
+        bbh_naive_average: 73.14
+        GPQA_diamond_accuracy: 25.76
+        hellaswag_accuracy: 94.79
+        TheoremQA_score: 21.5
+        musr_average_naive_average: 51.03
+        korbench_single_naive_average: 31.92
+        ARC_Prize_Public_Evaluation_accuracy: 0.01
+        gsm8k_accuracy: 86.73
+        GaokaoBench_weighted_average: 77.89
+        math_accuracy: 61.5
+        cmo_fib_accuracy: 12.5
+        aime2024_accuracy: 3.33
+        Mathbench_naive_average: 65.17
+        wikibench-wiki-single_choice_cncircular_perf_4: 31.55
+        cmmlu_naive_average: 74.14
+        mmlu_naive_average: 70.52
+        mmlu_pro_naive_average: 44.98
+        openai_humaneval_humaneval_pass@1: 70.73
+        sanitized_mbpp_score: 63.81
+        humanevalx_naive_average: 38.17
+        ds1000_naive_average: 14.15
+        lcb_code_generation_pass@1: 17.75
+        lcb_code_execution_pass@1: 32.57
+        lcb_test_output_pass@1: 24.89
+        bigcodebench_hard_instruct_pass@1: 0.08
+        bigcodebench_hard_complete_pass@1: 0.06
+        teval_naive_average: 80.03
+        qa_dingo_cn_score: 99.01
+        mmlu-stem_naive_average: 68.2
+        mmlu-social-science_naive_average: 76.11
+        mmlu-humanities_naive_average: 68.71
+        mmlu-other_naive_average: 70.56
+        cmmlu-stem_naive_average: 66.27
+        cmmlu-social-science_naive_average: 75.7
+        cmmlu-humanities_naive_average: 77.7
+        cmmlu-other_naive_average: 77.71
+        cmmlu-china-specific_naive_average: 72.94
+        mmlu_pro_biology_accuracy: 66.25
+        mmlu_pro_business_accuracy: 48.42
+        mmlu_pro_chemistry_accuracy: 35.25
+        mmlu_pro_computer_science_accuracy: 47.56
+        mmlu_pro_economics_accuracy: 55.92
+        mmlu_pro_engineering_accuracy: 30.44
+        mmlu_pro_health_accuracy: 45.97
+        mmlu_pro_history_accuracy: 41.21
+        mmlu_pro_law_accuracy: 25.79
+        mmlu_pro_math_accuracy: 54.03
+        mmlu_pro_philosophy_accuracy: 36.47
+        mmlu_pro_physics_accuracy: 37.41
+        mmlu_pro_psychology_accuracy: 58.77
+        mmlu_pro_other_accuracy: 46.21
+        humanevalx-python_pass@1: 53.66
+        humanevalx-cpp_pass@1: 24.39
+        humanevalx-go_pass@1: 0
+        humanevalx-java_pass@1: 57.93
+        humanevalx-js_pass@1: 54.88
+        ds1000_Pandas_accuracy: 12.03
+        ds1000_Numpy_accuracy: 4.09
+        ds1000_Tensorflow_accuracy: 11.11
+        ds1000_Scipy_accuracy: 8.49
+        ds1000_Sklearn_accuracy: 6.96
+        ds1000_Pytorch_accuracy: 7.35
+        ds1000_Matplotlib_accuracy: 49.03
+        openai_mmmlu_lite_AR-XY_accuracy: 17.89
+        openai_mmmlu_lite_BN-BD_accuracy: 27.58
+        openai_mmmlu_lite_DE-DE_accuracy: 51.16
+        openai_mmmlu_lite_ES-LA_accuracy: 56.84
+        openai_mmmlu_lite_FR-FR_accuracy: 57.96
+        openai_mmmlu_lite_HI-IN_accuracy: 33.68
+        openai_mmmlu_lite_ID-ID_accuracy: 51.02
+        openai_mmmlu_lite_IT-IT_accuracy: 50.46
+        openai_mmmlu_lite_JA-JP_accuracy: 50.53
+        openai_mmmlu_lite_KO-KR_accuracy: 45.05
+        openai_mmmlu_lite_PT-BR_accuracy: 57.68
+        openai_mmmlu_lite_SW-KE_accuracy: 32.77
+        openai_mmmlu_lite_YO-NG_accuracy: 31.79
+        openai_mmmlu_lite_ZH-CN_accuracy: 65.05
+        college_naive_average: 20.33
+        high_naive_average: 47.67
+        middle_naive_average: 62
+        primary_naive_average: 72
+        arithmetic_naive_average: 62.33
+        mathbench-a (average)_naive_average: 52.87
+        college_knowledge_naive_average: 70.57
+        high_knowledge_naive_average: 70.13
+        middle_knowledge_naive_average: 81.17
+        primary_knowledge_naive_average: 88.01
+        mathbench-t (average)_naive_average: 77.47
+    subjective:
+        alignment_bench_v1_1_总分: 5.68
+        alpaca_eval_total: 25.96
+        arenahard_score: 17.15
+        Followbench_naive_average: 0.81
+        CompassArena_naive_average: 34.61
+        FoFo_naive_average: 0.38
+        mtbench101_avg: 8.01
+        wildbench_average: -15.69
+        simpleqa_accuracy_given_attempted: 0.04
+        chinese_simpleqa_given_attempted_accuracy: 0.34
+        alignment_bench_v1_1_专业能力: 6.05
+        alignment_bench_v1_1_数学计算: 5.87
+        alignment_bench_v1_1_基本任务: 6.01
+        alignment_bench_v1_1_逻辑推理: 4.48
+        alignment_bench_v1_1_中文理解: 6.17
+        alignment_bench_v1_1_文本写作: 6.06
+        alignment_bench_v1_1_角色扮演: 6.3
+        alignment_bench_v1_1_综合问答: 6.45
+        alpaca_eval_helpful_base: 17.83
+        alpaca_eval_koala: 28.21
+        alpaca_eval_oasst: 23.4
+        alpaca_eval_selfinstruct: 30.95
+        alpaca_eval_vicuna: 25
+        compassarena_language_naive_average: 52.5
+        compassarena_knowledge_naive_average: 36
+        compassarena_reason_v2_naive_average: 35
+        compassarena_math_v2_naive_average: 19.91
+        compassarena_creationv2_zh_naive_average: 29.64
+        fofo_test_prompts_overall: 0.35
+        fofo_test_prompts_cn_overall: 0.41
+        followbench_llmeval_en_HSR_AVG: 0.73
+        followbench_llmeval_en_SSR_AVG: 0.88
+        followbench_llmeval_en_HSR_L1: 0.94
+        followbench_llmeval_en_HSR_L2: 0.77
+        followbench_llmeval_en_HSR_L3: 0.73
+        followbench_llmeval_en_HSR_L4: 0.68
+        followbench_llmeval_en_HSR_L5: 0.54
+        followbench_llmeval_en_SSR_L1: 0.94
+        followbench_llmeval_en_SSR_L2: 0.88
+        followbench_llmeval_en_SSR_L3: 0.87
+        followbench_llmeval_en_SSR_L4: 0.87
+        followbench_llmeval_en_SSR_L5: 0.85
+        simpleqa_f1: 0.04
 
 internlm2_5-7b-chat-1m-turbomind:
-    ruler_8k_naive_average: 88.53
-    ruler_32k_naive_average: 83.84
-    ruler_128k_naive_average: 70.94
-    NeedleBench-Overall-Score-8K_weighted_average: 91.89
-    NeedleBench-Overall-Score-32K_weighted_average: 91.42
-    NeedleBench-Overall-Score-128K_weighted_average: 88.57
-    longbench_naive_average: 46.44
-    longbench_zh_naive_average: 45.19
-    longbench_en_naive_average: 45.71
-    babilong_0k_naive_average: 79.3
-    babilong_4k_naive_average: 67
-    babilong_16k_naive_average: 52.7
-    babilong_32k_naive_average: 48.9
-    babilong_128k_naive_average: 40.8
-    babilong_256k_naive_average: 23.5
-    longbench_single-document-qa_naive_average: 43.56
-    longbench_multi-document-qa_naive_average: 46.24
-    longbench_summarization_naive_average: 24.32
-    longbench_few-shot-learning_naive_average: 51.67
-    longbench_synthetic-tasks_naive_average: 66.83
-    longbench_code-completion_naive_average: 45.99
+    long_context:
+        ruler_8k_naive_average: 88.53
+        ruler_32k_naive_average: 83.84
+        ruler_128k_naive_average: 70.94
+        NeedleBench-Overall-Score-8K_weighted_average: 91.89
+        NeedleBench-Overall-Score-32K_weighted_average: 91.42
+        NeedleBench-Overall-Score-128K_weighted_average: 88.57
+        longbench_naive_average: 46.44
+        longbench_zh_naive_average: 45.19
+        longbench_en_naive_average: 45.71
+        babilong_0k_naive_average: 79.3
+        babilong_4k_naive_average: 67
+        babilong_16k_naive_average: 52.7
+        babilong_32k_naive_average: 48.9
+        babilong_128k_naive_average: 40.8
+        babilong_256k_naive_average: 23.5
+        longbench_single-document-qa_naive_average: 43.56
+        longbench_multi-document-qa_naive_average: 46.24
+        longbench_summarization_naive_average: 24.32
+        longbench_few-shot-learning_naive_average: 51.67
+        longbench_synthetic-tasks_naive_average: 66.83
+        longbench_code-completion_naive_average: 45.99
diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml
index f1254343..131fd2ea 100644
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -1,459 +1,468 @@
-baichuan2-7b-chat-hf:
-    gsm8k_accuracy: 18.75
-    race-high_accuracy: 78.12
-
-glm-4-9b-chat-hf:
-    gsm8k_accuracy: 68.75
-    race-high_accuracy: 90.62
-
-glm-4-9b-chat-turbomind:
-    gsm8k_accuracy: 75.00
-    race-high_accuracy: 90.62
-
-glm-4-9b-chat-vllm:
-    gsm8k_accuracy: 65.62
-    race-high_accuracy: 90.62
-
-deepseek-7b-chat-hf:
-    gsm8k_accuracy: 46.88
-    race-high_accuracy: 81.25
-
-deepseek-moe-16b-chat-hf:
-    gsm8k_accuracy: 50
-    race-high_accuracy: 68.75
-
-deepseek-7b-chat-vllm:
-    gsm8k_accuracy: 43.75
-    race-high_accuracy: 75
-
-gemma2-2b-it-hf:
-    gsm8k_accuracy: 50
-    race-high_accuracy: 71.88
-
-gemma2-9b-it-hf:
-    gsm8k_accuracy: 71.88
-    race-high_accuracy: 84.38
-
-gemma-2b-it-hf:
-    gsm8k_accuracy: 3.12
-    race-high_accuracy: 40.62
-
-gemma-7b-it-hf:
-    gsm8k_accuracy: 40.62
-    race-high_accuracy: 68.75
-
-gemma-2-9b-it-turbomind:
-    gsm8k_accuracy: 65.62
-    race-high_accuracy: 84.38
-
-gemma-7b-it-vllm:
-    gsm8k_accuracy: 34.38
-    race-high_accuracy: 68.75
-
-internlm2_5-7b-chat-hf:
-    gsm8k_accuracy: 84.38
-    race-high_accuracy: 90.62
-
-internlm2_5-7b-chat-turbomind:
-    gsm8k_accuracy: 84.38
-    race-high_accuracy: 90.62
-
-internlm2-chat-1.8b-turbomind:
-    gsm8k_accuracy: 25
-    race-high_accuracy: 84.38
-
-internlm2-chat-1.8b-sft-turbomind:
-    gsm8k_accuracy: 21.88
-    race-high_accuracy: 84.38
-
-internlm2-chat-7b-lmdeploy:
-    gsm8k_accuracy: 53.12
-    race-high_accuracy: 84.38
-
-internlm2-chat-7b-sft-turbomind:
-    gsm8k_accuracy: 50
-    race-high_accuracy: 90.62
-
-internlm2-chat-7b-vllm:
-    gsm8k_accuracy: 43.75
-    race-high_accuracy: 87.5
-
-llama-3_1-8b-instruct-hf:
-    gsm8k_accuracy: 84.38
-    race-high_accuracy: 90.62
-
-llama-3_2-3b-instruct-hf:
-    gsm8k_accuracy: 68.75
-    race-high_accuracy: 81.25
-
-llama-3-8b-instruct-hf:
-    gsm8k_accuracy: 68.75
-    race-high_accuracy: 87.5
-
-llama-3_1-8b-instruct-turbomind:
-    gsm8k_accuracy: 78.12
-    race-high_accuracy: 90.62
-
-llama-3_2-3b-instruct-turbomind:
-    gsm8k_accuracy: 65.62
-    race-high_accuracy: 81.25
-
-llama-3-8b-instruct-turbomind:
-    gsm8k_accuracy: 68.75
-    race-high_accuracy: 87.5
-
-mistral-7b-instruct-v0.2-hf:
-    gsm8k_accuracy: 40.62
-    race-high_accuracy: 75
-
-mistral-7b-instruct-v0.3-hf:
-    gsm8k_accuracy: 40.62
-    race-high_accuracy: 75
-
-mistral-nemo-instruct-2407-hf:
-    gsm8k_accuracy: 75
-    race-high_accuracy: 81.25
-
-mistral-nemo-instruct-2407-turbomind:
-    gsm8k_accuracy: 68.75
-    race-high_accuracy: 87.50
-
-mistral-7b-instruct-v0.1-vllm:
-    gsm8k_accuracy: 34.38
-    race-high_accuracy: 68.75
-
-mistral-7b-instruct-v0.2-vllm:
-    gsm8k_accuracy: 43.75
-    race-high_accuracy: 75
-
-MiniCPM3-4B-hf:
-    gsm8k_accuracy: 68.75
-    race-high_accuracy: 84.38
-
-minicpm-2b-dpo-fp32-hf:
-    gsm8k_accuracy: 56.25
-    race-high_accuracy: 53.12
-
-minicpm-2b-sft-bf16-hf:
-    gsm8k_accuracy: 46.88
-    race-high_accuracy: 65.62
-
-minicpm-2b-sft-fp32-hf:
-    gsm8k_accuracy: 46.88
-    race-high_accuracy: 65.62
-
-phi-3-mini-4k-instruct-hf:
-    gsm8k_accuracy: 56.25
-    race-high_accuracy: 84.38
-
-qwen1.5-0.5b-chat-hf:
-    gsm8k_accuracy: 0
-    race-high_accuracy: 53.12
-
-qwen2-1.5b-instruct-hf:
-    gsm8k_accuracy: 62.5
-    race-high_accuracy: 84.38
-
-qwen2-7b-instruct-hf:
-    gsm8k_accuracy: 68.75
-    race-high_accuracy: 90.62
-
-qwen2-1.5b-instruct-turbomind:
-    gsm8k_accuracy: 62.50
-    race-high_accuracy: 84.38
-
-qwen2-7b-instruct-turbomind:
-    gsm8k_accuracy: 81.25
-    race-high_accuracy: 87.5
-
-qwen1.5-0.5b-chat-vllm:
-    gsm8k_accuracy: 3.12
-    race-high_accuracy: 53.12
-
-yi-1.5-6b-chat-hf:
-    gsm8k_accuracy: 65.62
-    race-high_accuracy: 84.38
-
-yi-1.5-9b-chat-hf:
-    gsm8k_accuracy: 75
-    race-high_accuracy: 93.75
-
-deepseek-v2-lite-chat-hf:
-    gsm8k_accuracy: 43.75
-    race-high_accuracy: 71.88
-
-internlm2_5-20b-chat-hf:
-    gsm8k_accuracy: 84.38
-    race-high_accuracy: 87.5
-
-internlm2_5-20b-chat-turbomind:
-    gsm8k_accuracy: 84.38
-    race-high_accuracy: 87.5
-
-mistral-small-instruct-2409-hf:
-    gsm8k_accuracy: 81.25
-    race-high_accuracy: 87.50
-
-mistral-small-instruct-2409-turbomind:
-    gsm8k_accuracy: 78.12
-    race-high_accuracy: 87.50
-
-qwen2.5-14b-instruct-hf:
-    gsm8k_accuracy: 71.88
-    race-high_accuracy: 96.88
-
-qwen2.5-14b-instruct-turbomind:
-    gsm8k_accuracy: 71.88
-    race-high_accuracy: 93.75
-
-glm-4-9b-hf:
-    gsm8k_accuracy: 68.75
-    GPQA_diamond_accuracy: 31.25
-    race-high_accuracy: 93.75
-    winogrande_accuracy: 84.38
-
-deepseek-moe-16b-base-hf:
-    gsm8k_accuracy: 21.88
-    GPQA_diamond_accuracy: 0
-    race-high_accuracy: 21.88
-    winogrande_accuracy: 65.62
-
-deepseek-7b-base-turbomind:
-    gsm8k_accuracy: 21.88
-    GPQA_diamond_accuracy: 0
-    race-high_accuracy: 46.88
-    winogrande_accuracy: 84.38
-
-deepseek-moe-16b-base-vllm:
-    gsm8k_accuracy: 21.88
-    GPQA_diamond_accuracy: 0
-    race-high_accuracy: 25
-    winogrande_accuracy: 68.75
-
-gemma2-2b-hf:
-    gsm8k_accuracy: 31.25
-    GPQA_diamond_accuracy: 3.12
-    race-high_accuracy: 56.25
-    winogrande_accuracy: 71.88
-
-gemma2-9b-hf:
-    gsm8k_accuracy: 68.75
-    GPQA_diamond_accuracy: 0
-    race-high_accuracy: 81.25
-    winogrande_accuracy: 84.38
-
-gemma-2b-hf:
-    gsm8k_accuracy: 18.75
-    GPQA_diamond_accuracy: 3.12
-    race-high_accuracy: 25
-    winogrande_accuracy: 53.12
-
-gemma-7b-hf:
-    gsm8k_accuracy: 56.25
-    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy: 65.62
-    winogrande_accuracy: 78.12
-
-gemma-2b-vllm:
-    gsm8k_accuracy: 15.62
-    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy:
-    winogrande_accuracy:
-
-gemma-7b-vllm:
-    gsm8k_accuracy: 53.12
-    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy:
-    winogrande_accuracy:
-
-internlm2_5-7b-hf:
-    gsm8k_accuracy: 37.5
-    GPQA_diamond_accuracy: 25
-    race-high_accuracy: 93.75
-    winogrande_accuracy: 71.88
-
-internlm2-7b-hf:
-    gsm8k_accuracy: 53.12
-    GPQA_diamond_accuracy: 18.75
-    race-high_accuracy: 62.5
-    winogrande_accuracy: 78.12
-
-internlm2-base-7b-hf:
-    gsm8k_accuracy: 3.12
-    GPQA_diamond_accuracy: 21.88
-    race-high_accuracy: 75
-    winogrande_accuracy: 65.62
-
-internlm2-1.8b-turbomind:
-    gsm8k_accuracy: 12.5
-    GPQA_diamond_accuracy: 12.5
-    race-high_accuracy: 71.88
-    winogrande_accuracy: 75
-
-internlm2_5-7b-turbomind:
-    gsm8k_accuracy: 68.75
-    GPQA_diamond_accuracy: 31.25
-    race-high_accuracy: 93.75
-    winogrande_accuracy: 84.38
-
-internlm2-7b-turbomind:
-    gsm8k_accuracy: 56.25
-    GPQA_diamond_accuracy: 21.88
-    race-high_accuracy: 75
-    winogrande_accuracy: 81.25
-
-internlm2-base-7b-turbomind:
-    gsm8k_accuracy: 40.62
-    GPQA_diamond_accuracy: 28.12
-    race-high_accuracy: 84.38
-    winogrande_accuracy: 71.88
-
-llama-2-7b-hf:
-    gsm8k_accuracy: 21.88
-    GPQA_diamond_accuracy: 21.88
-    race-high_accuracy: 40.62
-    winogrande_accuracy: 71.88
-
-llama-3_1-8b-hf:
-    gsm8k_accuracy: 78.12
-    GPQA_diamond_accuracy: 25
-    race-high_accuracy: 90.62
-    winogrande_accuracy: 62.5
-
-llama-3-8b-hf:
-    gsm8k_accuracy: 46.88
-    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy: 65.62
-    winogrande_accuracy: 65.62
-
-llama-3.1-8b-turbomind:
-    gsm8k_accuracy: 56.25
-    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy: 78.12
-    winogrande_accuracy: 78.12
-
-llama-3-8b-turbomind:
-    gsm8k_accuracy: 50
-    GPQA_diamond_accuracy: 9.38
-    race-high_accuracy: 65.62
-    winogrande_accuracy: 78.12
-
-mistral-7b-v0.2-hf:
-    gsm8k_accuracy: 31.25
-    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy: 62.5
-    winogrande_accuracy: 59.38
-
-mistral-7b-v0.3-hf:
-    gsm8k_accuracy: 31.25
-    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy: 62.5
-    winogrande_accuracy: 59.38
-
-mistral-7b-v0.2-vllm:
-    gsm8k_accuracy: 34.38
-    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy: 62.5
-    winogrande_accuracy: 65.62
-
-qwen2.5-7b-hf:
-    gsm8k_accuracy: 81.25
-    GPQA_diamond_accuracy: 18.75
-    race-high_accuracy: 87.5
-    winogrande_accuracy: 71.88
-
-qwen2.5-1.5b-turbomind:
-    gsm8k_accuracy: 71.88
-    GPQA_diamond_accuracy: 15.62
-    race-high_accuracy: 78.12
-    winogrande_accuracy: 71.88
-
-qwen2.5-7b-turbomind:
-    gsm8k_accuracy: 71.88
-    GPQA_diamond_accuracy: 25
-    race-high_accuracy: 87.5
-    winogrande_accuracy: 71.88
-
-qwen1.5-moe-a2.7b-hf:
-    gsm8k_accuracy: 62.5
-    GPQA_diamond_accuracy: 18.75
-    race-high_accuracy: 84.38
-    winogrande_accuracy: 75
-
-qwen2-0.5b-hf:
-    gsm8k_accuracy: 25
-    GPQA_diamond_accuracy: 0
-    race-high_accuracy: 40.62
-    winogrande_accuracy: 62.5
-
-qwen2-1.5b-hf:
-    gsm8k_accuracy: 59.38
-    GPQA_diamond_accuracy: 9.38
-    race-high_accuracy: 81.25
-    winogrande_accuracy: 62.5
-
-qwen2-7b-hf:
-    gsm8k_accuracy: 68.75
-    GPQA_diamond_accuracy: 9.38
-    race-high_accuracy: 87.5
-    winogrande_accuracy: 68.75
-
-qwen2-1.5b-turbomind:
-    gsm8k_accuracy: 62.50
-    GPQA_diamond_accuracy: 6.25
-    race-high_accuracy: 81.25
-    winogrande_accuracy: 75
-
-qwen2-7b-turbomind:
-    gsm8k_accuracy: 68.75
-    GPQA_diamond_accuracy: 12.5
-    race-high_accuracy: 87.5
-    winogrande_accuracy: 71.88
-
-qwen1.5-0.5b-vllm:
-    gsm8k_accuracy: 9.38
-    GPQA_diamond_accuracy: 0
-    race-high_accuracy: 56.25
-    winogrande_accuracy: 62.5
-
-yi-1.5-6b-hf:
-    gsm8k_accuracy: 62.5
-    GPQA_diamond_accuracy: 3.12
-    race-high_accuracy: 87.5
-    winogrande_accuracy: 62.5
-
-yi-1.5-9b-hf:
-    gsm8k_accuracy: 75
-    GPQA_diamond_accuracy: 40.62
-    race-high_accuracy: 87.5
-    winogrande_accuracy: 59.38
-
-deepseek-v2-lite-hf:
-    gsm8k_accuracy: 28.12
-    GPQA_diamond_accuracy: 21.88
-    race-high_accuracy: 59.38
-    winogrande_accuracy: 75
-
-internlm2-20b-hf:
-    gsm8k_accuracy: 56.25
-    GPQA_diamond_accuracy: 15.62
-    race-high_accuracy: 68.75
-    winogrande_accuracy: 75
-
-internlm2-base-20b-hf:
-    gsm8k_accuracy: 12.5
-    GPQA_diamond_accuracy: 9.38
-    race-high_accuracy: 84.38
-    winogrande_accuracy: 65.62
-
-internlm2-20b-turbomind:
-    gsm8k_accuracy: 68.75
-    GPQA_diamond_accuracy: 15.62
-    race-high_accuracy: 68.75
-    winogrande_accuracy: 81.25
-
-qwen2.5-14b-hf:
-    gsm8k_accuracy: 75
-    GPQA_diamond_accuracy: 37.5
-    race-high_accuracy: 93.75
-    winogrande_accuracy: 84.38
+chat:
+    glm-4-9b-chat-hf:
+        gsm8k_accuracy: 68.75
+        race-high_accuracy: 90.62
+    glm-4-9b-chat-turbomind:
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 90.62
+    glm-4-9b-chat-vllm:
+        gsm8k_accuracy: 65.62
+        race-high_accuracy: 90.62
+    deepseek-7b-chat-hf:
+        gsm8k_accuracy: 46.88
+        race-high_accuracy: 81.25
+    deepseek-moe-16b-chat-hf:
+        gsm8k_accuracy: 50
+        race-high_accuracy: 68.75
+    deepseek-7b-chat-vllm:
+        gsm8k_accuracy: 43.75
+        race-high_accuracy: 75
+    gemma2-2b-it-hf:
+        gsm8k_accuracy: 50
+        race-high_accuracy: 71.88
+    gemma2-9b-it-hf:
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 84.38
+    gemma-2b-it-hf:
+        gsm8k_accuracy: 3.12
+        race-high_accuracy: 40.62
+    gemma-7b-it-hf:
+        gsm8k_accuracy: 40.62
+        race-high_accuracy: 68.75
+    gemma-2-9b-it-turbomind:
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 84.38
+    gemma-2-27b-it-turbomind:
+        gsm8k_accuracy: 78.12
+        race-high_accuracy: 93.75
+    gemma-7b-it-vllm:
+        gsm8k_accuracy: 34.38
+        race-high_accuracy: 68.75
+    internlm2_5-7b-chat-hf:
+        gsm8k_accuracy: 84.38
+        race-high_accuracy: 90.62
+    internlm2_5-7b-chat-turbomind:
+        gsm8k_accuracy: 87.50
+        race-high_accuracy: 90.62
+    internlm2-chat-1.8b-turbomind:
+        gsm8k_accuracy: 28.12
+        race-high_accuracy: 84.38
+    internlm2-chat-1.8b-sft-turbomind:
+        gsm8k_accuracy: 21.88
+        race-high_accuracy: 84.38
+    internlm2-chat-7b-lmdeploy:
+        gsm8k_accuracy: 53.12
+        race-high_accuracy: 84.38
+    internlm2-chat-7b-sft-turbomind:
+        gsm8k_accuracy: 53.12
+        race-high_accuracy: 90.62
+    internlm2-chat-7b-vllm:
+        gsm8k_accuracy: 56.25
+        race-high_accuracy: 84.38
+    llama-3_1-8b-instruct-hf:
+        gsm8k_accuracy: 84.38
+        race-high_accuracy: 90.62
+    llama-3_2-3b-instruct-hf:
+        gsm8k_accuracy: 68.75
+        race-high_accuracy: 81.25
+    llama-3-8b-instruct-hf:
+        gsm8k_accuracy: 68.75
+        race-high_accuracy: 87.5
+    llama-2-7b-chat-turbomind:
+        gsm8k_accuracy: 18.75
+        race-high_accuracy: 46.88
+    llama-3_1-8b-instruct-turbomind:
+        gsm8k_accuracy: 78.12
+        race-high_accuracy: 90.62
+    llama-3_2-3b-instruct-turbomind:
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 81.25
+    llama-3-8b-instruct-turbomind:
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 87.5
+    mistral-7b-instruct-v0.2-hf:
+        gsm8k_accuracy: 40.62
+        race-high_accuracy: 75
+    mistral-7b-instruct-v0.3-hf:
+        gsm8k_accuracy: 40.62
+        race-high_accuracy: 75
+    mistral-nemo-instruct-2407-hf:
+        gsm8k_accuracy: 75
+        race-high_accuracy: 81.25
+    mistral-nemo-instruct-2407-turbomind:
+        gsm8k_accuracy: 65.62
+        race-high_accuracy: 87.50
+    mistral-7b-instruct-v0.1-vllm:
+        gsm8k_accuracy: 34.38
+        race-high_accuracy: 68.75
+    mistral-7b-instruct-v0.2-vllm:
+        gsm8k_accuracy: 43.75
+        race-high_accuracy: 75
+    MiniCPM3-4B-hf:
+        gsm8k_accuracy: 68.75
+        race-high_accuracy: 84.38
+    phi-3-mini-4k-instruct-hf:
+        gsm8k_accuracy: 56.25
+        race-high_accuracy: 84.38
+    phi-3-small-8k-instruct-hf:
+        gsm8k_accuracy: 0
+        race-high_accuracy: 0
+    qwen2.5-0.5b-instruct-hf:
+        gsm8k_accuracy: 34.38
+        race-high_accuracy: 46.88
+    qwen2.5-3b-instruct-hf :
+        gsm8k_accuracy: 53.12
+        race-high_accuracy: 90.62
+    qwen2.5-0.5b-instruct-turbomind:
+        gsm8k_accuracy: 28.12
+        race-high_accuracy: 50
+    qwen2.5-3b-instruct-turbomind:
+        gsm8k_accuracy: 59.38
+        race-high_accuracy: 90.62
+    qwen1.5-0.5b-chat-hf:
+        gsm8k_accuracy: 0
+        race-high_accuracy: 53.12
+    qwen2-1.5b-instruct-hf:
+        gsm8k_accuracy: 62.5
+        race-high_accuracy: 84.38
+    qwen2-7b-instruct-hf:
+        gsm8k_accuracy: 68.75
+        race-high_accuracy: 90.62
+    qwen2-1.5b-instruct-turbomind:
+        gsm8k_accuracy: 53.12
+        race-high_accuracy: 84.38
+    qwen2-7b-instruct-turbomind:
+        gsm8k_accuracy: 81.25
+        race-high_accuracy: 90.62
+    qwen1.5-0.5b-chat-vllm:
+        gsm8k_accuracy: 3.12
+        race-high_accuracy: 53.12
+    yi-1.5-6b-chat-hf:
+        gsm8k_accuracy: 65.62
+        race-high_accuracy: 84.38
+    yi-1.5-9b-chat-hf:
+        gsm8k_accuracy: 75
+        race-high_accuracy: 93.75
+    yi-1.5-6b-chat-turbomind:
+        gsm8k_accuracy: 62.5
+        race-high_accuracy: 84.38
+    yi-1.5-9b-chat-turbomind:
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 93.75
+    deepseek-v2-lite-chat-hf:
+        gsm8k_accuracy: 46.88
+        race-high_accuracy: 71.88
+    gemma2-27b-it-hf:
+        gsm8k_accuracy: 75
+        race-high_accuracy: 93.75
+    internlm2_5-20b-chat-hf:
+        gsm8k_accuracy: 84.38
+        race-high_accuracy: 87.5
+    internlm2_5-20b-chat-turbomind:
+        gsm8k_accuracy: 87.50
+        race-high_accuracy: 87.5
+    mistral-small-instruct-2409-hf:
+        gsm8k_accuracy: 81.25
+        race-high_accuracy: 87.50
+    mistral-small-instruct-2409-turbomind:
+        gsm8k_accuracy: 81.25
+        race-high_accuracy: 87.50
+    qwen2.5-14b-instruct-hf:
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 96.88
+    qwen2.5-14b-instruct-turbomind:
+        gsm8k_accuracy: 68.75
+        race-high_accuracy: 93.75
+    yi-1.5-34b-chat-turbomind:
+        gsm8k_accuracy: 78.12
+        race-high_accuracy: 93.75
+    deepseek-67b-chat-hf:
+        gsm8k_accuracy: 71.88
+        race-high_accuracy: 78.12
+    llama-3_3-70b-instruct-turbomind:
+        gsm8k_accuracy: 93.75
+        race-high_accuracy: 87.5
+    mixtral-8x7b-instruct-v0.1-hf:
+        gsm8k_accuracy: 56.25
+        race-high_accuracy: 81.25
+    mixtral-large-instruct-2411-turbomind:
+        gsm8k_accuracy: 90.62
+        race-high_accuracy: 93.75
+    nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
+        gsm8k_accuracy: 87.5
+        race-high_accuracy: 46.88
+    qwen2.5-72b-instruct-turbomind:
+        gsm8k_accuracy: 75
+        race-high_accuracy: 93.75
+    deepseek-v2_5-1210-turbomind:
+        gsm8k_accuracy: 90.62
+        race-high_accuracy: 84.38
+    mixtral-8x22b-instruct-v0.1-hf:
+        gsm8k_accuracy: 81.25
+        race-high_accuracy: 81.25
+base:
+    glm-4-9b-hf:
+        gsm8k_accuracy: 68.75
+        GPQA_diamond_accuracy: 31.25
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 84.38
+    glm-4-9b-turbomind:
+        gsm8k_accuracy: 62.5
+        GPQA_diamond_accuracy: 28.12
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 84.38
+    deepseek-7b-base-hf:
+        gsm8k_accuracy: 25
+        GPQA_diamond_accuracy: 0
+        race-high_accuracy: 46.88
+        winogrande_accuracy: 71.88
+    deepseek-moe-16b-base-hf:
+        gsm8k_accuracy: 21.88
+        GPQA_diamond_accuracy: 0
+        race-high_accuracy: 21.88
+        winogrande_accuracy: 65.62
+    deepseek-7b-base-turbomind:
+        gsm8k_accuracy: 21.88
+        GPQA_diamond_accuracy: 0
+        race-high_accuracy: 46.88
+        winogrande_accuracy: 84.38
+    deepseek-moe-16b-base-vllm:
+        gsm8k_accuracy: 21.88
+        GPQA_diamond_accuracy: 0
+        race-high_accuracy: 25
+        winogrande_accuracy: 68.75
+    gemma2-2b-hf:
+        gsm8k_accuracy: 28.12
+        GPQA_diamond_accuracy: 3.12
+        race-high_accuracy: 56.25
+        winogrande_accuracy: 71.88
+    gemma2-9b-hf:
+        gsm8k_accuracy: 68.75
+        GPQA_diamond_accuracy: 0
+        race-high_accuracy: 81.25
+        winogrande_accuracy: 84.38
+    gemma-2b-hf:
+        gsm8k_accuracy: 18.75
+        GPQA_diamond_accuracy: 3.12
+        race-high_accuracy: 25
+        winogrande_accuracy: 53.12
+    gemma-7b-hf:
+        gsm8k_accuracy: 56.25
+        GPQA_diamond_accuracy: 6.25
+        race-high_accuracy: 65.62
+        winogrande_accuracy: 78.12
+    gemma-2b-vllm:
+        gsm8k_accuracy: 15.62
+        GPQA_diamond_accuracy: 3.12
+        race-high_accuracy:  # NOTE(review): empty value parses as YAML null, not a score; confirm the score check tolerates a missing baseline
+        winogrande_accuracy:  # NOTE(review): empty (null) baseline as well
+    gemma-7b-vllm:
+        gsm8k_accuracy: 53.12
+        GPQA_diamond_accuracy: 9.38
+        race-high_accuracy:  # NOTE(review): empty (null) baseline, same as gemma-2b-vllm
+        winogrande_accuracy:  # NOTE(review): empty (null) baseline
+    internlm2_5-7b-hf:
+        gsm8k_accuracy: 37.5
+        GPQA_diamond_accuracy: 25
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 71.88
+    internlm2-7b-hf:
+        gsm8k_accuracy: 53.12
+        GPQA_diamond_accuracy: 18.75
+        race-high_accuracy: 62.5
+        winogrande_accuracy: 78.12
+    internlm2-base-7b-hf:
+        gsm8k_accuracy: 3.12
+        GPQA_diamond_accuracy: 21.88
+        race-high_accuracy: 75
+        winogrande_accuracy: 65.62
+    internlm2-1.8b-turbomind:
+        gsm8k_accuracy: 12.5
+        GPQA_diamond_accuracy: 9.38
+        race-high_accuracy: 71.88
+        winogrande_accuracy: 78.12
+    internlm2_5-7b-turbomind:
+        gsm8k_accuracy: 62.50
+        GPQA_diamond_accuracy: 34.38
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 87.50
+    internlm2-7b-turbomind:
+        gsm8k_accuracy: 53.12
+        GPQA_diamond_accuracy: 21.88
+        race-high_accuracy: 71.88
+        winogrande_accuracy: 84.38
+    internlm2-base-7b-turbomind:
+        gsm8k_accuracy: 37.50
+        GPQA_diamond_accuracy: 28.12
+        race-high_accuracy: 81.25
+        winogrande_accuracy: 75
+    llama-2-7b-hf:
+        gsm8k_accuracy: 21.88
+        GPQA_diamond_accuracy: 21.88
+        race-high_accuracy: 40.62
+        winogrande_accuracy: 71.88
+    llama-3_1-8b-hf:
+        gsm8k_accuracy: 78.12
+        GPQA_diamond_accuracy: 25
+        race-high_accuracy: 90.62
+        winogrande_accuracy: 62.5
+    llama-3-8b-hf:
+        gsm8k_accuracy: 46.88
+        GPQA_diamond_accuracy: 6.25
+        race-high_accuracy: 65.62
+        winogrande_accuracy: 65.62
+    llama-3.1-8b-turbomind:
+        gsm8k_accuracy: 56.25
+        GPQA_diamond_accuracy: 9.38
+        race-high_accuracy: 78.12
+        winogrande_accuracy: 78.12
+    llama-3-8b-turbomind:
+        gsm8k_accuracy: 50
+        GPQA_diamond_accuracy: 12.50
+        race-high_accuracy: 65.62
+        winogrande_accuracy: 78.12
+    mistral-7b-v0.2-hf:
+        gsm8k_accuracy: 31.25
+        GPQA_diamond_accuracy: 6.25
+        race-high_accuracy: 62.5
+        winogrande_accuracy: 59.38
+    mistral-7b-v0.3-hf:
+        gsm8k_accuracy: 31.25
+        GPQA_diamond_accuracy: 6.25
+        race-high_accuracy: 62.5
+        winogrande_accuracy: 59.38
+    mistral-7b-v0.2-vllm:
+        gsm8k_accuracy: 34.38
+        GPQA_diamond_accuracy: 6.25
+        race-high_accuracy: 62.5
+        winogrande_accuracy: 65.62
+    qwen2.5-7b-hf:
+        gsm8k_accuracy: 81.25
+        GPQA_diamond_accuracy: 18.75
+        race-high_accuracy: 87.5
+        winogrande_accuracy: 71.88
+    qwen2.5-1.5b-turbomind:
+        gsm8k_accuracy: 62.50
+        GPQA_diamond_accuracy: 12.50
+        race-high_accuracy: 78.12
+        winogrande_accuracy: 68.75
+    qwen2.5-7b-turbomind:
+        gsm8k_accuracy: 75.00
+        GPQA_diamond_accuracy: 25
+        race-high_accuracy: 87.5
+        winogrande_accuracy: 71.88
+    qwen1.5-moe-a2.7b-hf:
+        gsm8k_accuracy: 62.5
+        GPQA_diamond_accuracy: 18.75
+        race-high_accuracy: 84.38
+        winogrande_accuracy: 75
+    qwen2-0.5b-hf:
+        gsm8k_accuracy: 25
+        GPQA_diamond_accuracy: 0
+        race-high_accuracy: 40.62
+        winogrande_accuracy: 62.5
+    qwen2-1.5b-hf:
+        gsm8k_accuracy: 59.38
+        GPQA_diamond_accuracy: 9.38
+        race-high_accuracy: 81.25
+        winogrande_accuracy: 62.5
+    qwen2-7b-hf:
+        gsm8k_accuracy: 68.75
+        GPQA_diamond_accuracy: 9.38
+        race-high_accuracy: 87.5
+        winogrande_accuracy: 68.75
+    qwen2-1.5b-turbomind:
+        gsm8k_accuracy: 56.25
+        GPQA_diamond_accuracy: 9.38
+        race-high_accuracy: 81.25
+        winogrande_accuracy: 75
+    qwen2-7b-turbomind:
+        gsm8k_accuracy: 75.00
+        GPQA_diamond_accuracy: 12.5
+        race-high_accuracy: 87.5
+        winogrande_accuracy: 71.88
+    qwen1.5-0.5b-vllm:
+        gsm8k_accuracy: 9.38
+        GPQA_diamond_accuracy: 0
+        race-high_accuracy: 56.25
+        winogrande_accuracy: 62.5
+    yi-1.5-6b-hf:
+        gsm8k_accuracy: 62.5
+        GPQA_diamond_accuracy: 3.12
+        race-high_accuracy: 87.5
+        winogrande_accuracy: 62.5
+    yi-1.5-9b-hf:
+        gsm8k_accuracy: 75
+        GPQA_diamond_accuracy: 40.62
+        race-high_accuracy: 87.5
+        winogrande_accuracy: 59.38
+    yi-1.5-9b-turbomind:
+        gsm8k_accuracy: 78.12
+        GPQA_diamond_accuracy: 40.62
+        race-high_accuracy: 87.5
+        winogrande_accuracy: 71.88
+    deepseek-v2-lite-hf:
+        gsm8k_accuracy: 31.25
+        GPQA_diamond_accuracy: 28.12
+        race-high_accuracy: 59.38
+        winogrande_accuracy: 71.88
+    internlm2-20b-hf:
+        gsm8k_accuracy: 56.25
+        GPQA_diamond_accuracy: 15.62
+        race-high_accuracy: 68.75
+        winogrande_accuracy: 75
+    internlm2-base-20b-hf:
+        gsm8k_accuracy: 12.5
+        GPQA_diamond_accuracy: 9.38
+        race-high_accuracy: 84.38
+        winogrande_accuracy: 65.62
+    internlm2-20b-turbomind:
+        gsm8k_accuracy: 71.88
+        GPQA_diamond_accuracy: 15.62
+        race-high_accuracy: 68.75
+        winogrande_accuracy: 81.25
+    qwen2.5-14b-hf:
+        gsm8k_accuracy: 75
+        GPQA_diamond_accuracy: 37.5
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 84.38
+    qwen2.5-32b-hf:
+        gsm8k_accuracy: 87.5
+        GPQA_diamond_accuracy: 31.25
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 78.12
+    qwen2.5-32b-turbomind:
+        gsm8k_accuracy: 84.38
+        GPQA_diamond_accuracy: 28.12
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 81.25
+    deepseek-67b-base-hf:
+        gsm8k_accuracy: 59.38
+        GPQA_diamond_accuracy: 31.25
+        race-high_accuracy: 81.25
+        winogrande_accuracy: 90.62
+    deepseek-67b-base-turbomind:
+        gsm8k_accuracy: 56.25
+        GPQA_diamond_accuracy: 28.12
+        race-high_accuracy: 81.25
+        winogrande_accuracy: 84.38
+    llama-3-70b-turbomind:
+        gsm8k_accuracy: 59.38
+        GPQA_diamond_accuracy: 9.38
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 84.38
+    qwen2.5-72b-turbomind:
+        gsm8k_accuracy: 84.38
+        GPQA_diamond_accuracy: 34.38
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 87.5
+    deepseek-v2-turbomind:
+        gsm8k_accuracy: 62.5
+        GPQA_diamond_accuracy: 3.12
+        race-high_accuracy: 81.25
+        winogrande_accuracy: 68.75
+    llama-3-70b-hf:
+        gsm8k_accuracy: 62.5
+        GPQA_diamond_accuracy: 3.12
+        race-high_accuracy: 93.75
+        winogrande_accuracy: 84.38
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index 35614e6a..4ea85c19 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -28,21 +28,21 @@ on:
         description: 'Set branch or tag or commit id. Default is "main"'
         type: string
         default: 'main'
-      regression_func:
+      regression_func_volc:
         required: true
         description: 'regression functions'
         type: string
-        default: "['chat_models','base_models', 'chat_obj_fullbench', 'chat_sub_fullbench', 'base_fullbench','cmd', 'api']"
-      cuda_env:
+        default: "['chat_models','base_models', 'chat_obj_fullbench', 'base_fullbench']"
+      regression_func_local:
         required: true
-        description: "regression conda env, eg. ['dsw_cu11','dsw_cu12']"
+        description: 'regression functions run by the daily_run_test_local job'
         type: string
-        default: "['dsw_cu12']"
+        default: "['cmd', 'api', 'chat_sub_fullbench']"
       fullbench_eval:
         required: true
         description: 'fullbench volc functions'
         type: string
-        default: "['base_long_context','base_objective','chat_long_context','chat_objective','chat_subjective']"
+        default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
   schedule:
     - cron:  '15 14 * * *'
 
@@ -54,6 +54,13 @@ env:
   LMDEPLOY_USE_MODELSCOPE: false
   HF_HUB_OFFLINE: 1
   OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
+  CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3  # miniconda root; steps source bin/activate from here
+  PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip  # passed to pip via --cache-dir in prepare_env
+  REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression  # base directory of the per-run --work-dir paths
+  COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache  # NOTE(review): presumably read by opencompass to locate datasets; one step overrides it with a _subset directory
+  HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub  # NOTE(review): presumably the Hugging Face hub cache location - confirm
+  HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub  # same directory as HUGGINGFACE_HUB_CACHE
+  CONDA_ENV: regression_test  # conda env name: created in prepare_env, activated by the later jobs
 
 jobs:
   build-pypi:
@@ -117,14 +124,7 @@ jobs:
   prepare_env:
     if: ${{!cancelled()}}
     needs: ['build-pypi', 'build-pypi-lmdeploy']
-    strategy:
-      fail-fast: false
-      matrix:
-        cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
-    runs-on: ${{ matrix.cuda_env }}
-    env:
-      CONDA_ENV: opencompass_regression
-      PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
+    runs-on: volc_cu12
     environment: 'prod'
     timeout-minutes: 240 #4hours
     steps:
@@ -140,79 +140,52 @@ jobs:
       - name:  Remove Conda Env
         if: always()
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          . ${{env.CONDA_PATH}}/bin/activate  # same miniconda root as every other step, defined once in the workflow env
+          conda env remove -y --name ${{env.CONDA_ENV}}
           conda info --envs
-      - name: Prepare - create conda env and install torch - cu11
-        if: ${{matrix.cuda_env == 'dsw_cu11'}}
-        uses: nick-fields/retry@v3
-        id: retry1
-        with:
-          max_attempts: 3
-          timeout_minutes: 40
-          command: |
-            . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-            conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
-            conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-            pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu11.txt --cache-dir ${{env.PIP_CACHE_PATH}}
-            pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-            pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.1+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-            pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-            pip uninstall torch torchvision torchaudio -y
-            pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
-            FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu11torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-            pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-            conda info --envs
-            pip list
       - name: Prepare - create conda env and install torch - cu12
-        if: ${{matrix.cuda_env == 'dsw_cu12'}}
         uses: nick-fields/retry@v3
-        id: retry2
         with:
           max_attempts: 3
-          timeout_minutes: 40
+          timeout_minutes: 240  # NOTE(review): per-attempt limit equals the job's timeout-minutes (240), so an attempt that times out leaves no room for the remaining retries
           command: |
-            . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-            conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
-            conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-            pip install -r /cpfs01/shared/public/qa-llm-cicd/requirements-cu12.txt --cache-dir ${{env.PIP_CACHE_PATH}}
+            . ${{env.CONDA_PATH}}/bin/activate
+            conda create -y --name ${{env.CONDA_ENV}} python=3.10
+            conda activate ${{env.CONDA_ENV}}
+            pip install -r /fs-computility/llm/qa-llm-cicd/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
             pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
             pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
             pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
             pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
-            FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-            pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-            conda info --envs
-            pip list
+            FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+            pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
+            cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
       - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
+        if: ${{inputs.build_lmdeploy}}
         uses: actions/download-artifact@v4
         with:
           name: my-artifact-${{ github.run_id }}-py310
       - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
+        if: ${{inputs.build_lmdeploy}}
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           pip install lmdeploy-*.whl --no-deps
+      - name: conda env
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
 
-  daily_run_test:
+  daily_run_test_volc:
     if: ${{!cancelled()}}
     needs: prepare_env
     strategy:
       fail-fast: false
       matrix:
-        cuda_env: ${{ fromJSON(inputs.cuda_env || '["dsw_cu12"]')}}
-        regression_func: ${{fromJSON(github.event.inputs.regression_func || '["chat_models","base_models","chat_obj_fullbench","chat_sub_fullbench","base_fullbench","cmd","api"]')}}
-    runs-on: ${{ matrix.cuda_env }}
-    env:
-      CONDA_ENV: opencompass_regression
-      PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
-      HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
-      HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
-      HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
-      COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache
-      REPORT_ROOT: /cpfs01/shared/public/qa-llm-cicd/report
+        regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
+    runs-on: volc_cu12_daily
     environment: 'prod'
     timeout-minutes: 240 #4hours
     steps:
@@ -221,105 +194,114 @@ jobs:
         with:
           repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
           ref: ${{github.event.inputs.repo_ref || 'main'}}
-      - name: Prepare - prepare data and hf model
+      - name: conda env
         run: |
-          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
-          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
+      - name:  modify config  # copies volc.py into the checkout and appends test_config.txt to the selected eval script
+        if: matrix.regression_func != 'chat_sub_fullbench'  # NOTE(review): chat_sub_fullbench is not in this job's default matrix; the guard only matters if it is passed via regression_func_volc
+        run: |
+          cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
+          cat /fs-computility/llm/qa-llm-cicd/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
+      - name:  Run test  # runs the eval config, links its summary dir to regression_result_daily, then runs the pytest cases marked with the matrix name
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 3  # the whole command block is re-run on failure; --reuse is passed to opencompass on every attempt
+          timeout_minutes: 40  # limit applied to each attempt
+          command: |
+            . ${{env.CONDA_PATH}}/bin/activate
+            conda activate ${{env.CONDA_ENV}}
+            conda info --envs
+            opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
+            rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
+            python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
+
+
+  daily_run_test_local:  # runs the regression_func_local matrix on the volc_cu12_local runner
+    if: ${{!cancelled()}}  # still runs when an earlier job failed, unless the workflow run was cancelled
+    needs: prepare_env  # the conda env activated below is created by prepare_env
+    strategy:
+      fail-fast: false  # a failing matrix entry does not cancel the others
+      matrix:
+        regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}  # the literal list is the fallback when the dispatch input is empty (e.g. scheduled runs)
+    runs-on: volc_cu12_local
+    environment: 'prod'
+    timeout-minutes: 240 #4hours
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: conda env  # diagnostic only: prints the conda envs and the installed packages
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
+      - name:  modify config  # copies volc.py into the checkout and appends test_config_sub.txt to the chat_sub_fullbench eval script
+        if: matrix.regression_func == 'chat_sub_fullbench'  # the cmd and api entries run without this config change
+        run: |
+          cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py .
+          cat /fs-computility/llm/qa-llm-cicd/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
       - name:  Run command testcase
         if: matrix.regression_func == 'cmd'
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           conda info --envs
           export from_tf=TRUE
           python tools/list_configs.py internlm2_5 mmlu
-          opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
           python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
-          opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
           python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
-          opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
           python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
-          opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
           python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name:  Run chat model test
-        if: matrix.regression_func == 'chat_models'
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_chat.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name:  Run base model test
-        if: matrix.regression_func == 'base_models'
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_base.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name:  Run chat model test - fullbench
-        if: matrix.regression_func == 'chat_obj_fullbench'
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_chat_objective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_obj_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m chat_obj_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name:  Run chat model test - fullbench
-        if: matrix.regression_func == 'chat_sub_fullbench'
-        env:
-          COMPASS_DATA_CACHE: /cpfs01/shared/public/llmeval/compass_data_cache_subset
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_chat_subjective_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/chat_sub_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m chat_sub_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
-      - name:  Run base model test - fullbench
-        if: matrix.regression_func == 'base_fullbench'
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
-          conda info --envs
-          opencompass .github/scripts/eval_regression_base_fullbench.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/base_full_${{ matrix.cuda_env }}/*/summary regression_result_daily
-          python -m pytest -m base_fullbench -s -v --color=yes .github/scripts/oc_score_assert.py
       - name:  Run model test - api
         if: matrix.regression_func == 'api'
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           conda info --envs
           lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log  2>&1  &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 120s
-          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }} --reuse --max-num-workers 2 --dump-eval-details
-          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api_${{ matrix.cuda_env }}/*/summary regression_result_daily
+          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
           python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
       - name:  Run model test - api kill
         if: always() && matrix.regression_func == 'api'
         run: |
           kill -15 "$restful_pid"
+      - name:  Run testcase  # runs the chat_sub_fullbench eval config, links its summary dir, then runs the pytest cases with the same marker
+        if: matrix.regression_func == 'chat_sub_fullbench'
+        env:
+          COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache_subset  # step-level override of the workflow-level COMPASS_DATA_CACHE (the _subset directory)
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          export from_tf=TRUE
+          opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
+          python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
 
   fullbench_run_test:
     if: ${{!cancelled()}}
-    needs: ['build-pypi', 'build-pypi-lmdeploy']
-    env:
-      FULLBENCH_CONDA_ENV: regression_test
-      FULLBENCH_REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/regression
-      COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
+    needs: prepare_env
     strategy:
       fail-fast: false
       matrix:
-        function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_long_context","base_objective","chat_long_context","chat_objective","chat_subjective"]')}}
+        function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
     runs-on: volc_cu12
     environment: 'prod'
     timeout-minutes: 360 #6hours
@@ -329,48 +311,30 @@ jobs:
         with:
           repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
           ref: ${{github.event.inputs.repo_ref || 'main'}}
-      - name: Download Artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: my-artifact-${{ github.run_id }}
-      - name: Prepare - reinstall opencompass - cu12
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
+      - name: conda env  # diagnostics only: list the conda envs and installed packages in the job log
         run: |
-          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.FULLBENCH_CONDA_ENV}}
-          pip install opencompass*.whl --no-deps
-      - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
-        uses: actions/download-artifact@v4
-        with:
-          name: my-artifact-${{ github.run_id }}-py310
-      - name: Prepare - reinstall lmdeploy - cu12
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
-        run: |
-          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.FULLBENCH_CONDA_ENV}}
-          pip install lmdeploy-*.whl --no-deps
-      - name: Conda env
-        if: ${{matrix.cuda_env == 'dsw_cu12' && inputs.build_lmdeploy}}
-        run: |
-          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.FULLBENCH_CONDA_ENV}}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           conda info --envs
           pip list
-      - name:  Run command testcase
-        run: |
-          . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.FULLBENCH_CONDA_ENV}}
-          conda info --envs
-          export from_tf=TRUE
-          opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
-          rm regression_result_daily -f && ln -s ${{env.FULLBENCH_REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
-          python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name:  Run testcase  # evaluate one fullbench config and assert its scores, retried on failure
+        uses: nick-fields/retry@v3  # third-party action that re-runs `command` when it fails
+        with:  # inputs of the retry action
+          max_attempts: 3  # give up after the third failed attempt
+          timeout_minutes: 240  # per-attempt limit; NOTE(review): the job's timeout-minutes is 360, so three full-length attempts cannot fit - confirm intended
+          command: |  # every attempt targets the same work dir; NOTE(review): --reuse presumably lets a retry pick up earlier results - confirm
+            . ${{env.CONDA_PATH}}/bin/activate
+            conda activate ${{env.CONDA_ENV}}
+            conda info --envs
+            export from_tf=TRUE
+            opencompass /fs-computility/llm/qa-llm-cicd/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
+            rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
+            python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
 
 
   notify_to_feishu:
     if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
-    needs: [daily_run_test, fullbench_run_test]
+    needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
     environment: 'prod'
     timeout-minutes: 5
     runs-on: self-hosted
diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml
index bc829eab..ef067720 100644
--- a/.github/workflows/pr-run-test.yml
+++ b/.github/workflows/pr-run-test.yml
@@ -18,18 +18,23 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  CONDA_ENV: opencompass_
-  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
-  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+  CONDA_ENV: pr_test  # single shared conda env; NOTE(review): steps pip-(un)install opencompass in it - confirm runs cannot overlap
   HF_DATASETS_OFFLINE: 1
+  HF_EVALUATE_OFFLINE: 1
   TRANSFORMERS_OFFLINE: 1
-  HF_HUB_OFFLINE: 1
   VLLM_USE_MODELSCOPE: false
   LMDEPLOY_USE_MODELSCOPE: false
+  HF_HUB_OFFLINE: 1
+  CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3  # miniconda root; steps source bin/activate from it
+  PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip  # handed to pip via --cache-dir
+  REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest  # opencompass work dirs are created under <REPORT_ROOT>/<run id>/
+  COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
+  HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub  # same path as HF_HUB_CACHE below; NOTE(review): presumably kept for tools that still read the older variable name - confirm
+  HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
 
 jobs:
   pr_run_test:
-    runs-on: dsw_cu12
+    runs-on: volc_cu12_local
     environment: 'prod'
     timeout-minutes: 30
     steps:
@@ -37,54 +42,55 @@ jobs:
         uses: actions/checkout@v2
       - name: Prepare - Install opencompass
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}${{ runner.name }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           python3 -m pip uninstall opencompass -y
-          python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
+          python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
-      - name: Prepare - prepare data and hf model
+      - name: conda env  # diagnostics only: list conda envs and installed packages, then run `lmdeploy check_env`
         run: |
-          cp -r ${{env.USERSPACE_PREFIX}}/data .
-          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
-          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
+          lmdeploy check_env
       - name:  Run test
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}${{ runner.name }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           conda info --envs
           rm -rf regression_result
-          opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result1 --debug
-          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result2 --debug --max-num-workers 2
-          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir regression_result3 --debug --max-num-workers 2
+          opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug
+          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2
+          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2
       - name:  Get result
         run: |
-          score=$(sed -n '$p' regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
+          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
           if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then
              echo "score is $score between 88 and 89"
           else
              echo "score is $score not between 88 and 89"
              exit 1
           fi
-          score=$(sed -n '$p' regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
+          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
           if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then
              echo "score is $score between 87 and 88"
           else
              echo "score is $score not between 87 and 88"
              exit 1
           fi
-          score=$(sed -n '$p' regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
+          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
           if (( ${score%.*} >= 87 && ${score%.*} <= 89 )); then
              echo "score is $score between 87 and 89"
           else
              echo "score is $score not between 87 and 89"
              exit 1
           fi
-          rm -rf regression_result1 & rm -rf regression_result2 & rm -rf regression_result3
       - name:  Uninstall opencompass
         if: always()
         run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}${{ runner.name }}
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
           python3 -m pip uninstall opencompass -y
           conda info --envs