Mirror of https://github.com/open-compass/opencompass.git
Synced 2025-05-30 16:03:24 +08:00
[CI] add more models into testcase and test env of cu12 (#1558)
* update
* Update pr-run-test.yml
* Update daily-run-test.yml

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
Parent: 87df8a73a3
Commit: aa43eaf267
.github/scripts/eval_regression_base.py (vendored, 26 lines changed)
@@ -8,15 +8,17 @@ with read_base():
        race_datasets  # noqa: F401, E501
    from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
        models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
        models as hf_deepseek_v2_lite_model  # noqa: F401, E501
    # read hf models - chat models
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
        models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
        models as vllm_deepseek_moe_16b_base_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_2b import \
        models as hf_gemma_2b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_7b import \
        models as hf_gemma_7b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_2b import \
        models as hf_gemma2_2b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_9b import \
        models as hf_gemma2_9b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
@@ -31,16 +33,28 @@ with read_base():
        models as lmdeploy_internlm2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
        models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama2_7b import \
        models as hf_llama2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
        models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
        models as lmdeploy_llama3_8b_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
        models as hf_mistral_7b_v0_2_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
        models as hf_mistral_7b_v0_3_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
        models as vllm_mistral_7b_v0_2_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \
        models as vllm_mixtral_8x7b_v0_1_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
        models as hf_qwen1_5_moe_a2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
        models as hf_qwen2_0_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
        models as hf_qwen2_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
        models as lmdeploy_qwen2_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
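For context, these regression configs rely on OpenCompass's read_base() context manager, which executes the imported config modules and leaves each one's models list in the local namespace under the *_model alias. A minimal sketch of the pattern, with the final sum(...) collection line shown as an assumption about how such scripts typically gather the groups rather than a quote from this diff:

from mmengine.config import read_base

with read_base():
    # Each imported module defines a `models` list; the alias keeps
    # every model group addressable in the current namespace.
    from opencompass.configs.models.gemma.hf_gemma2_9b import \
        models as hf_gemma2_9b_model  # noqa: F401, E501

# Collect every `*_model` list defined above into one flat models list.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])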
.github/scripts/eval_regression_chat.py (vendored, 34 lines changed)
@@ -13,20 +13,32 @@ with read_base():
        models as hf_baichuan2_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
        models as hf_glm4_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
        models as lmdeploy_glm4_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.chatglm.vllm_glm4_9b_chat import \
        models as vllm_glm4_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
        models as hf_deepseek_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
        models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
        models as hf_deepseek_v2_lite_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
        models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_2b_it import \
        models as hf_gemma_2b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_7b_it import \
        models as hf_gemma_7b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
        models as hf_gemma2_2b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
        models as hf_gemma2_9b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
        models as vllm_gemma_7b_it_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
        models as hf_internlm2_5_20b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
        models as lmdeploy_internlm2_5_20b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
        models as lmdeploy_internlm2_chat_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
@@ -37,14 +49,20 @@ with read_base():
        models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
        models as vllm_internlm2_chat_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
        models as hf_llama3_1_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
        models as hf_llama3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
        models as hf_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
        models as hf_mistral_7b_instruct_v0_3_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
        models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \
        models as vllm_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
    from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
        models as hf_minicpm_2b_dpo_fp32_model  # noqa: F401, E501
    from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
@@ -57,6 +75,10 @@ with read_base():
        models as hf_phi_3_mini_8k_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
        models as hf_qwen1_5_0_5b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
        models as hf_qwen2_1_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \
        models as hf_qwen2_7b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
        models as lmdeploy_qwen2_1_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
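Each of these config modules exports a models list of plain dict entries. For orientation, a representative HuggingFace chat entry is sketched below; the field values are assumptions based on typical upstream configs of the time, not taken from this diff:

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='qwen2-7b-instruct-hf',    # abbreviation later matched by oc_score_assert.py
        path='Qwen/Qwen2-7B-Instruct',  # HF hub id; resolved offline via the cache env vars in CI
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]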
.github/scripts/oc_score_assert.py (vendored, 43 lines changed)
@@ -7,30 +7,35 @@ import yaml

 output_path = 'regression_result_daily'

 chat_model_list = [
-    'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
-    'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf',
-    'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind',
-    'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind',
-    'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind',
-    'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf',
-    'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
-    'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf',
-    'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
-    'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
+    'baichuan2-7b-chat-hf', 'glm-4-9b-chat-turbomind', 'glm-4-9b-chat-vllm',
+    'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
+    'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf',
+    'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
+    'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind',
+    'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
+    'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
+    'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
+    'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf',
+    'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
+    'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm',
+    'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
+    'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
+    'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
+    'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
+    'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
+    'lmdeploy-api-test'
 ]
 base_model_list = [
-    'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
-    'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf',
-    'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
-    'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind',
-    'internlm2-7b-turbomind', 'internlm2-base-7b-hf',
-    'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
-    'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf',
-    'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
-    'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
+    'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf',
+    'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf',
+    'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf',
+    'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind',
+    'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind',
+    'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf',
+    'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf',
+    'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
+    'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
+    'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
 ]
 dataset_list = ['gsm8k', 'race-middle', 'race-high']
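These lists feed the assertions that compare run results against the baseline YAML below. The exact pass criterion is not visible in this hunk, so the following is a minimal sketch, assuming a simple absolute-tolerance check against the recorded baseline score (the tolerance value and function name are illustrative):

import yaml

TOL = 5  # assumed absolute tolerance, in score points

def check_score(model_abbr: str, dataset: str, actual: float,
                baseline_path: str = '.github/scripts/oc_score_baseline.yaml') -> None:
    # Baselines are keyed by model abbreviation, then dataset name.
    with open(baseline_path) as f:
        expected = yaml.safe_load(f)[model_abbr][dataset]
    assert abs(actual - expected) <= TOL, (
        f'{model_abbr}/{dataset}: got {actual}, baseline {expected}')

# e.g. check_score('gemma2-9b-it-hf', 'gsm8k', 78.6)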
.github/scripts/oc_score_baseline.yaml (vendored, 114 lines changed)
@@ -8,6 +8,16 @@ glm-4-9b-chat-hf:
   race-middle: 88
   race-high: 88

+glm-4-9b-chat-turbomind:
+  gsm8k: 69
+  race-middle: 82
+  race-high: 77
+
+glm-4-9b-chat-vllm:
+  gsm8k: 73
+  race-middle: 87
+  race-high: 87
+
 deepseek-7b-chat-hf:
   gsm8k: 60
   race-middle: 74
@@ -18,6 +28,11 @@ deepseek-moe-16b-chat-hf:
   race-middle: 62
   race-high: 70

+deepseek-v2-lite-chat-hf:
+  gsm8k: 59
+  race-middle: 82
+  race-high: 79
+
 deepseek-7b-chat-vllm:
   gsm8k: 63
   race-middle: 74
@@ -33,23 +48,48 @@ gemma-7b-it-hf:
   race-middle: 74
   race-high: 71

+gemma-7b-it-vllm:
+  gsm8k: 38
+  race-middle: 75
+  race-high: 70
+
+gemma2-2b-it-hf:
+  gsm8k: 62
+  race-middle: 75
+  race-high: 67
+
+gemma2-9b-it-hf:
+  gsm8k: 80
+  race-middle: 89
+  race-high: 85
+
 internlm2_5-7b-chat-hf:
   gsm8k: 86
   race-middle: 92
   race-high: 93

+internlm2_5-20b-chat-hf:
+  gsm8k: 91
+  race-middle: 95
+  race-high: 91
+
 internlm2_5-7b-chat-turbomind:
   gsm8k: 87
   race-middle: 92
   race-high: 93

+internlm2_5-20b-chat-turbomind:
+  gsm8k: 91
+  race-middle: 95
+  race-high: 91
+
 internlm2-chat-1.8b-turbomind:
   gsm8k: 40
   race-middle: 82
   race-high: 83

 internlm2-chat-1.8b-sft-turbomind:
-  gsm8k: 32
+  gsm8k: 34
   race-middle: 81
   race-high: 83

@@ -68,11 +108,21 @@ internlm2-chat-7b-vllm:
   race-middle: 90
   race-high: 91

+llama-3_1-8b-instruct-hf:
+  gsm8k: 82
+  race-middle: 82
+  race-high: 88
+
 llama-3-8b-instruct-hf:
   gsm8k: 77
   race-middle: 85
   race-high: 87

+llama-3_1-8b-instruct-turbomind:
+  gsm8k: 79
+  race-middle: 82
+  race-high: 88
+
 llama-3-8b-instruct-turbomind:
   gsm8k: 77
   race-middle: 85
@@ -83,6 +133,11 @@ mistral-7b-instruct-v0.2-hf:
   race-middle: 82
   race-high: 78

+mistral-7b-instruct-v0.3-hf:
+  gsm8k: 53
+  race-middle: 80
+  race-high: 78
+
 mistral-7b-instruct-v0.2-vllm:
   gsm8k: 49
   race-middle: 81
@@ -118,6 +173,11 @@ qwen1.5-0.5b-chat-hf:
   race-middle: 55
   race-high: 50

+qwen2-1.5b-instruct-hf:
+  gsm8k: 63
+  race-middle: 77
+  race-high: 86
+
 qwen2-1.5b-instruct-turbomind:
   gsm8k: 60
   race-middle: 77
@@ -128,6 +188,11 @@ qwen2-7b-instruct-turbomind:
   race-middle: 87
   race-high: 89

+qwen2-7b-instruct-hf:
+  gsm8k: 85
+  race-middle: 87
+  race-high: 91
+
 qwen1.5-0.5b-chat-vllm:
   gsm8k: 5
   race-middle: 57
@@ -153,6 +218,11 @@ deepseek-moe-16b-base-hf:
   race-middle: 35
   race-high: 23

+deepseek-v2-lite-hf:
+  gsm8k: 37
+  race-middle: 56
+  race-high: 62
+
 deepseek-7b-base-turbomind:
   gsm8k: 21
   race-middle: 42
@@ -173,8 +243,18 @@ gemma-7b-hf:
   race-middle: 59
   race-high: 66

+gemma2-2b-hf:
+  gsm8k: 8
+  race-middle: 31
+  race-high: 30
+
+gemma2-9b-hf:
+  gsm8k: 20
+  race-middle: 42
+  race-high: 35
+
 internlm2_5-7b-hf:
-  gsm8k: 46
+  gsm8k: 47
   race-middle: 92
   race-high: 91

@@ -208,6 +288,21 @@ internlm2-base-7b-turbomind:
   race-middle: 75
   race-high: 81

+llama-2-7b-hf:
+  gsm8k: 17
+  race-middle: 32
+  race-high: 38
+
+llama-3-8b-hf:
+  gsm8k: 48
+  race-middle: 64
+  race-high: 70
+
+llama-3.1-8b-turbomind:
+  gsm8k: 57
+  race-middle: 67
+  race-high: 75
+
 llama-3-8b-turbomind:
   gsm8k: 52
   race-middle: 63
@@ -218,6 +313,11 @@ mistral-7b-v0.2-hf:
   race-middle: 42
   race-high: 60

+mistral-7b-v0.3-hf:
+  gsm8k: 43
+  race-middle: 42
+  race-high: 60
+
 mistral-7b-v0.2-vllm:
   gsm8k: 45
   race-middle: 42
@@ -228,11 +328,21 @@ qwen1.5-moe-a2.7b-hf:
   race-middle: 78
   race-high: 90

+qwen2-1.5b-hf:
+  gsm8k: 58
+  race-middle: 65
+  race-high: 78
+
 qwen2-0.5b-hf:
   gsm8k: 35
   race-middle: 52
   race-high: 48

+qwen2-7b-hf:
+  gsm8k: 82
+  race-middle: 88
+  race-high: 89
+
 qwen2-1.5b-turbomind:
   gsm8k: 57
   race-middle: 64
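oc_score_assert.py walks the model lists against dataset_list and looks each pair up in this baseline file. A hedged sketch of how such a parameterized check can be wired up in pytest; the model subset and the result_scores fixture are placeholders, not quoted from the script:

import pytest
import yaml

DATASETS = ['gsm8k', 'race-middle', 'race-high']
MODELS = ['glm-4-9b-chat-turbomind', 'qwen2-7b-instruct-hf']  # illustrative subset

with open('.github/scripts/oc_score_baseline.yaml') as f:
    BASELINE = yaml.safe_load(f)

@pytest.mark.chat
@pytest.mark.parametrize('model', MODELS)
@pytest.mark.parametrize('dataset', DATASETS)
def test_chat_score(model, dataset, result_scores):
    # result_scores: an assumed fixture that parses regression_result_daily.
    assert abs(result_scores[model][dataset] - BASELINE[model][dataset]) <= 5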
.github/workflows/daily-run-test.yml (vendored, 75 lines changed)
@@ -14,9 +14,14 @@ env:
   PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
   USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
   HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+  HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+  HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
   DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
   HF_DATASETS_OFFLINE: 1
   HF_EVALUATE_OFFLINE: 1
   TRANSFORMERS_OFFLINE: 1
+  VLLM_USE_MODELSCOPE: false
+  LMDEPLOY_USE_MODELSCOPE: false
   HF_HUB_OFFLINE: 1
+  TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas

@@ -43,7 +48,11 @@ jobs:

   daily_run_test:
     needs: build-pypi
-    runs-on: self-hosted
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda_env: [dsw_cu11, dsw_cu12]
+    runs-on: ${{ matrix.cuda_env }}
     environment: 'prod'
     timeout-minutes: 420 #7hours
     steps:
@@ -53,22 +62,38 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           name: my-artifact-${{ github.run_id }}
-      - name: Prepare - create conda env and install torch
+      - name: Prepare - create conda env and install torch - cu11
+        if: ${{matrix.cuda_env == 'dsw_cu11'}}
         run: |
           . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda create -y --name ${{env.CONDA_ENV}} python=3.10
-          conda activate ${{env.CONDA_ENV}}
-          pip install opencompass*.whl
-          pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-
-          pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}}
+          conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
+          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}}
          pip uninstall torch torchvision torchaudio -y
          pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
          conda info --envs
          pip list
+      - name: Prepare - create conda env and install torch - cu12
+        if: ${{matrix.cuda_env == 'dsw_cu12'}}
+        run: |
+          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
+          conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
+          conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
+          pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install lmdeploy==0.6.0 --cache-dir ${{env.PIP_CACHE_PATH}} --no-cache-dir
+          pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip uninstall torch torchvision torchaudio -y
+          pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
+          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+          conda info --envs
+          pip list
      - name: Prepare - prepare data and hf model
        run: |
          ln -s ${{env.DATEASET_CACHE_PATH}} data
@@ -77,45 +102,45 @@ jobs:
      - name: Run chat model test
        run: |
          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-         conda activate ${{env.CONDA_ENV}}
+         conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
          conda info --envs
          sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
-         python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat --reuse
-         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat/*/summary regression_result_daily
+         opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
+         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
          python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
      - name: Run base model test
        run: |
          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-         conda activate ${{env.CONDA_ENV}}
+         conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
          conda info --envs
-         python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base --reuse
-         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base/*/summary regression_result_daily
+         opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
+         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
          python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
      - name: Run command testcase
        run: |
          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-         conda activate ${{env.CONDA_ENV}}
+         conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
          conda info --envs
          export from_tf=TRUE
          python tools/list_configs.py internlm2_5 mmlu
-         python run.py --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1 --reuse
-         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1/*/summary regression_result_daily
+         opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2
+         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
          python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
-         python run.py --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse
-         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2/*/summary regression_result_daily
+         opencompass --models hf_internlm2_5_7b_chat hf_internlm2_5_1_8b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2
+         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
          python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
-         python run.py --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse
-         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3/*/summary regression_result_daily
+         opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2
+         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
          python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
-         python run.py --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse
-         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4/*/summary regression_result_daily
+         opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2
+         rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
          python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
      - name: Remove Conda Env
        if: always()
        run: |
          rm -rf regression_result_daily
          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-         conda env remove -y --name ${{env.CONDA_ENV}}
+         conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
          conda info --envs

  notify_to_feishu:
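Each test step above republishes the newest summary directory as regression_result_daily via the rm/ln -s pair. A rough Python equivalent of that bookkeeping, under the assumption that each run writes a timestamped folder containing a summary directory under the work dir:

import os
from pathlib import Path

def publish_summary(work_dir: str, link_name: str = 'regression_result_daily') -> None:
    # Pick the most recently modified <run>/summary under work_dir.
    runs = sorted(Path(work_dir).glob('*/summary'), key=os.path.getmtime)
    if not runs:
        raise FileNotFoundError(f'no summary directory under {work_dir}')
    link = Path(link_name)
    if link.is_symlink() or link.exists():
        link.unlink()
    link.symlink_to(runs[-1])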
.github/workflows/pr-run-test.yml (vendored, 2 lines changed)
@@ -51,7 +51,7 @@ jobs:
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          rm -rf regression_result
-         python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug
+         opencompass --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug
      - name: Get result
        run: |
          score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
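The Get result step takes the last field of the last line of the summary CSV. A small Python sketch of the same extraction, with the glob pattern carried over from the shell command:

import csv
from glob import glob

def last_score(pattern: str = 'regression_result/*/summary/*.csv') -> float:
    # Mirrors `sed -n '$p' ... | awk -F ',' '{print $NF}'`:
    # take the final row of the summary CSV and return its last column.
    path = sorted(glob(pattern))[-1]
    with open(path, newline='') as f:
        rows = list(csv.reader(f))
    return float(rows[-1][-1])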