[ci] update daily testcase (#1285)

* Update daily-run-test.yml

* Create eval_regression_chat.py

* Delete .github/scripts/.github/scripts/eval_regression_chat.py

* Create eval_regression_chat.py

* Update pr-run-test.yml

* Update daily-run-test.yml

* Update oc_score_baseline.yaml

* Update oc_score_assert.py

* Update daily-run-test.yml

* Update oc_score_baseline.yaml

* Update oc_score_assert.py

* Update oc_score_assert.py

* fix lint

* update

* Update daily-run-test.yml

* update

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
zhulinJulia24 authored on 2024-07-03 18:56:09 +08:00; committed by GitHub
parent 28eba6fe34
commit 167cfdcca3
7 changed files with 368 additions and 52 deletions

.github/scripts/eval_regression_base.py (new file, 52 lines)

@@ -0,0 +1,52 @@
from mmengine.config import read_base

with read_base():
    # choose a list of datasets
    from ...configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
        gsm8k_datasets  # noqa: F401, E501
    from ...configs.datasets.race.race_ppl import \
        race_datasets  # noqa: F401, E501
    from ...configs.models.deepseek.hf_deepseek_moe_16b_base import \
        models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
    # read hf models - base models
    from ...configs.models.deepseek.lmdeploy_deepseek_7b_base import \
        models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
    from ...configs.models.deepseek.vllm_deepseek_moe_16b_base import \
        models as vllm_deepseek_moe_16b_base_model  # noqa: F401, E501
    from ...configs.models.gemma.hf_gemma_2b import \
        models as hf_gemma_2b_model  # noqa: F401, E501
    from ...configs.models.gemma.hf_gemma_7b import \
        models as hf_gemma_7b_model  # noqa: F401, E501
    from ...configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
        models as lmdeploy_internlm2_1_8b_model  # noqa: F401, E501
    from ...configs.models.hf_internlm.lmdeploy_internlm2_7b import \
        models as lmdeploy_internlm2_7b_model  # noqa: F401, E501
    from ...configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
        models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
    from ...configs.models.hf_llama.lmdeploy_llama3_8b import \
        models as lmdeploy_llama3_8b_model  # noqa: F401, E501
    from ...configs.models.mistral.hf_mistral_7b_v0_2 import \
        models as hf_mistral_7b_v0_2_model  # noqa: F401, E501
    from ...configs.models.mistral.vllm_mistral_7b_v0_2 import \
        models as vllm_mistral_7b_v0_2_model  # noqa: F401, E501
    from ...configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
        models as hf_qwen1_5_moe_a2_7b_model  # noqa: F401, E501
    from ...configs.models.qwen.hf_qwen2_0_5b import \
        models as hf_qwen2_0_5b_model  # noqa: F401, E501
    from ...configs.models.qwen.lmdeploy_qwen2_1_5b import \
        models as lmdeploy_qwen2_1_5b_model  # noqa: F401, E501
    from ...configs.models.qwen.lmdeploy_qwen2_7b import \
        models as lmdeploy_qwen2_7b_model  # noqa: F401, E501
    from ...configs.models.qwen.vllm_qwen1_5_0_5b import \
        models as vllm_qwen1_5_0_5b_model  # noqa: F401, E501
    from ...configs.models.yi.hf_yi_1_5_6b import \
        models as hf_yi_1_5_6b_model  # noqa: F401, E501
    from ...configs.models.yi.hf_yi_1_5_9b import \
        models as hf_yi_1_5_9b_model  # noqa: F401, E501
    from ...configs.summarizers.medium import summarizer  # noqa: F401, E501

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

for d in datasets:
    d['reader_cfg']['test_range'] = '[0:100]'
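Note: the two `sum(...)` lines above collect every imported name ending in `_model` or `_datasets` into one flat list, relying on `locals()` at module scope. A minimal standalone sketch of the same idiom, with demo names only (not OpenCompass code):

    # Names ending in `_model` / `_datasets` are gathered from module globals.
    demo_a_model = [{'abbr': 'model-a'}]
    demo_b_model = [{'abbr': 'model-b'}]
    demo_gsm8k_datasets = [{'abbr': 'gsm8k', 'reader_cfg': {}}]

    models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
    datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

    assert [m['abbr'] for m in models] == ['model-a', 'model-b']
    assert [d['abbr'] for d in datasets] == ['gsm8k']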

.github/scripts/eval_regression_chat.py (new file, 70 lines)

@@ -0,0 +1,70 @@
from mmengine.config import read_base

with read_base():
    # choose a list of datasets
    from ...configs.datasets.gsm8k.gsm8k_gen import \
        gsm8k_datasets  # noqa: F401, E501
    from ...configs.datasets.race.race_gen import \
        race_datasets  # noqa: F401, E501
    # read hf models - chat models
    from ...configs.models.baichuan.hf_baichuan2_7b_chat import \
        models as hf_baichuan2_7b_chat_model  # noqa: F401, E501
    from ...configs.models.chatglm.hf_glm4_9b_chat import \
        models as hf_glm4_9b_chat_model  # noqa: F401, E501
    from ...configs.models.deepseek.hf_deepseek_7b_chat import \
        models as hf_deepseek_7b_chat_model  # noqa: F401, E501
    from ...configs.models.deepseek.hf_deepseek_moe_16b_chat import \
        models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
    from ...configs.models.deepseek.vllm_deepseek_7b_chat import \
        models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
    from ...configs.models.gemma.hf_gemma_2b_it import \
        models as hf_gemma_2b_it_model  # noqa: F401, E501
    from ...configs.models.gemma.hf_gemma_7b_it import \
        models as hf_gemma_7b_it_model  # noqa: F401, E501
    from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
        models as lmdeploy_internlm2_chat_1_8b_model  # noqa: F401, E501
    from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
        models as lmdeploy_internlm2_chat_1_8b_sft_model  # noqa: F401, E501
    from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
        models as lmdeploy_internlm2_chat_7b_model  # noqa: F401, E501
    from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
        models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
    from ...configs.models.hf_internlm.vllm_internlm2_chat_7b import \
        models as vllm_internlm2_chat_7b_model  # noqa: F401, E501
    from ...configs.models.hf_llama.hf_llama3_8b_instruct import \
        models as hf_llama3_8b_instruct_model  # noqa: F401, E501
    from ...configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
    from ...configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
        models as hf_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
    from ...configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
        models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
    from ...configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
        models as hf_minicpm_2b_dpo_fp32_model  # noqa: F401, E501
    from ...configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
        models as hf_minicpm_2b_sft_bf16_model  # noqa: F401, E501
    from ...configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
        models as hf_minicpm_2b_sft_fp32_model  # noqa: F401, E501
    from ...configs.models.phi.hf_phi_3_mini_4k_instruct import \
        models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
    from ...configs.models.phi.hf_phi_3_small_8k_instruct import \
        models as hf_phi_3_small_8k_instruct_model  # noqa: F401, E501
    from ...configs.models.qwen.hf_qwen1_5_0_5b_chat import \
        models as hf_qwen1_5_0_5b_chat_model  # noqa: F401, E501
    from ...configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
        models as lmdeploy_qwen2_1_5b_instruct_model  # noqa: F401, E501
    from ...configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as lmdeploy_qwen2_7b_instruct_model  # noqa: F401, E501
    from ...configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
        models as vllm_qwen1_5_0_5b_chat_model  # noqa: F401, E501
    from ...configs.models.yi.hf_yi_1_5_6b_chat import \
        models as hf_yi_1_5_6b_chat_model  # noqa: F401, E501
    from ...configs.models.yi.hf_yi_1_5_9b_chat import \
        models as hf_yi_1_5_9b_chat_model  # noqa: F401, E501
    from ...configs.summarizers.medium import summarizer  # noqa: F401, E501

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

for d in datasets:
    d['reader_cfg']['test_range'] = '[0:100]'
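Both regression configs cap every dataset at its first 100 samples via `reader_cfg['test_range']`. A hedged illustration of what a `'[0:100]'` slice string means (assumption: OpenCompass applies it as a Python-style slice over the test split; the snippet below is illustrative, not framework code):

    test_range = '[0:100]'
    examples = list(range(1319))  # stand-in for a full test split, e.g. GSM8K
    start, stop = (int(x) for x in test_range.strip('[]').split(':'))
    assert len(examples[start:stop]) == 100  # the daily run scores only these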

.github/scripts/oc_score_assert.py (modified)

@@ -6,11 +6,26 @@ import yaml
 output_path = 'regression_result_daily'

-model_list = ['internlm2-7b-hf', 'internlm-chat-7b-hf', 'chatglm3-6b-base-hf']
-dataset_list = [
-    'ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval', 'openbookqa',
-    'openbookqa_fact'
+chat_model_list = [
+    'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
+    'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2-chat-1.8b-turbomind',
+    'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
+    'internlm2-chat-7b-sft-turbomind', 'llama-3-8b-instruct-hf',
+    'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
+    'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
+    'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
+    'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind',
+    'qwen2-7b-instruct-turbomind', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf'
 ]
+base_model_list = [
+    'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf',
+    'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind',
+    'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
+    'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
+    'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', 'yi-1.5-6b-hf',
+    'yi-1.5-9b-hf'
+]
+dataset_list = ['gsm8k', 'race-middle', 'race-high']


 @pytest.fixture()
@@ -32,10 +47,28 @@ def result_scores():

 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores')
+@pytest.mark.chat
 class TestChat:
     """Test cases for chat model."""

-    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
+    @pytest.mark.parametrize('model, dataset', [(p1, p2)
+                             for p1 in chat_model_list
                              for p2 in dataset_list])
     def test_model_dataset_score(self, baseline_scores, result_scores, model,
                                  dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(result_score, base_score)

+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
+@pytest.mark.base
+class TestBase:
+    """Test cases for base model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2)
+                             for p1 in base_model_list
+                             for p2 in dataset_list])
+    def test_model_dataset_score(self, baseline_scores, result_scores, model,
+                                 dataset):
@@ -47,13 +80,13 @@ class TestChat:

 def assert_score(score, baseline):
     if score is None or score == '-':
         assert False, 'value is none'
-    if float(score) < (baseline * 1.03) and float(score) > (baseline * 0.97):
-        print(score + ' between ' + str(baseline * 0.97) + ' and ' +
-              str(baseline * 1.03))
+    if float(score) <= (baseline + 5) and float(score) >= (baseline - 5):
+        print(score + ' between ' + str(baseline - 5) + ' and ' +
+              str(baseline + 5))
         assert True
     else:
         assert False, score + ' not between ' + str(
-            baseline * 0.97) + ' and ' + str(baseline * 1.03)
+            baseline - 5) + ' and ' + str(baseline + 5)


 def find_csv_files(directory):
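The new check above is a plain ±5 absolute band around the baseline, replacing the earlier ±3% relative band. An equivalent, more idiomatic formulation using pytest's own helper (a sketch, assuming scores arrive as numeric strings from the CSV summaries):

    import pytest

    def assert_score(score, baseline):
        assert score is not None and score != '-', 'value is none'
        # pytest.approx(baseline, abs=5) reproduces the [baseline - 5, baseline + 5] band
        assert float(score) == pytest.approx(baseline, abs=5), \
            f'{score} not between {baseline - 5} and {baseline + 5}'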
@@ -62,11 +95,11 @@ def find_csv_files(directory):
     for file in files:
         if file.endswith('.csv'):
             csv_files.append(os.path.join(root, file))
-    if len(csv_files) > 1:
-        raise 'have more than 1 result file, please check the result manually'
-    if len(csv_files) == 0:
-        return None
-    return csv_files[0]
+
+    csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
+    sorted_csv_files = sorted(csv_files_with_time.items(), key=lambda x: x[1])
+    latest_csv_file = sorted_csv_files[-1][0]
+    return latest_csv_file


 def read_csv_file(file_path):
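`find_csv_files` now returns the most recently created summary CSV instead of failing whenever more than one exists. A compact pathlib equivalent of the same selection (a sketch; the directory layout is assumed to match `regression_result_daily`):

    from pathlib import Path

    def find_latest_csv(directory):
        # Newest-by-creation-time CSV anywhere under `directory`, or None if absent.
        csv_files = sorted(Path(directory).rglob('*.csv'),
                           key=lambda p: p.stat().st_ctime)
        return str(csv_files[-1]) if csv_files else None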

.github/scripts/oc_score_baseline.yaml (modified)

@@ -1,31 +1,180 @@
-internlm-7b-hf:
-  ARC-c: 34.24
-  chid-dev: 79.70
-  chid-test: 81.12
-  openai_humaneval: 10.98
-  openbookqa: 47.20
-  openbookqa_fact: 74.00
+baichuan2-7b-chat-hf:
+  gsm8k: 30
+  race-middle: 74
+  race-high: 79

-internlm-chat-7b-hf:
-  ARC-c: 36.95
-  chid-dev: 71.78
-  chid-test: 76.87
-  openai_humaneval: 21.34
-  openbookqa: 66.6
-  openbookqa_fact: 80.4
+deepseek-7b-chat-hf:
+  gsm8k: 60
+  race-middle: 74
+  race-high: 80

-chatglm3-6b-base-hf:
-  ARC-c: 44.41
-  chid-dev: 78.22
-  chid-test: 78.57
-  openai_humaneval: 20.73
-  openbookqa: 78.40
-  openbookqa_fact: 92.00
+deepseek-moe-16b-chat-hf:
+  gsm8k: 62
+  race-middle: 62
+  race-high: 70

-internlm2-7b-hf:
-  ARC-c: 36.27
-  chid-dev: 55.94
-  chid-test: 53.70
-  openai_humaneval: 45.12
-  openbookqa: 80.00
-  openbookqa_fact: 86.40
+gemma-2b-it-hf:
+  gsm8k: 14
+  race-middle: 62
+  race-high: 52

+gemma-7b-it-hf:
+  gsm8k: 39
+  race-middle: 74
+  race-high: 71

+internlm2-chat-1.8b-turbomind:
+  gsm8k: 40
+  race-middle: 82
+  race-high: 83

+internlm2-chat-1.8b-sft-turbomind:
+  gsm8k: 32
+  race-middle: 81
+  race-high: 83

+internlm2-chat-7b-turbomind:
+  gsm8k: 69
+  race-middle: 90
+  race-high: 88

+internlm2-chat-7b-sft-turbomind:
+  gsm8k: 71
+  race-middle: 91
+  race-high: 92

+llama-3-8b-instruct-hf:
+  gsm8k: 77
+  race-middle: 85
+  race-high: 87

+llama-3-8b-instruct-turbomind:
+  gsm8k: 77
+  race-middle: 85
+  race-high: 89

+mistral-7b-instruct-v0.2-hf:
+  gsm8k: 48
+  race-middle: 82
+  race-high: 78

+minicpm-2b-dpo-fp32-hf:
+  gsm8k: 58
+  race-middle: 66
+  race-high: 74

+minicpm-2b-sft-bf16-hf:
+  gsm8k: 58
+  race-middle: 75
+  race-high: 81

+minicpm-2b-sft-fp32-hf:
+  gsm8k: 58
+  race-middle: 75
+  race-high: 81

+phi-3-mini-4k-instruct-hf:
+  gsm8k: 67
+  race-middle: 81
+  race-high: 84

+qwen1.5-0.5b-chat-hf:
+  gsm8k: 5
+  race-middle: 55
+  race-high: 50

+qwen2-1.5b-instruct-turbomind:
+  gsm8k: 60
+  race-middle: 77
+  race-high: 86

+qwen2-7b-instruct-turbomind:
+  gsm8k: 88
+  race-middle: 87
+  race-high: 89

+yi-1.5-6b-chat-hf:
+  gsm8k: 72
+  race-middle: 88
+  race-high: 86

+yi-1.5-9b-chat-hf:
+  gsm8k: 81
+  race-middle: 89
+  race-high: 91

+deepseek-moe-16b-base-hf:
+  gsm8k: 25
+  race-middle: 35
+  race-high: 23

+deepseek-7b-base-turbomind:
+  gsm8k: 21
+  race-middle: 42
+  race-high: 42

+gemma-2b-hf:
+  gsm8k: 19
+  race-middle: 33
+  race-high: 26

+gemma-7b-hf:
+  gsm8k: 65
+  race-middle: 59
+  race-high: 66

+internlm2-1.8b-turbomind:
+  gsm8k: 27
+  race-middle: 75
+  race-high: 72

+internlm2-7b-turbomind:
+  gsm8k: 67
+  race-middle: 78
+  race-high: 76

+internlm2-base-7b-turbomind:
+  gsm8k: 39
+  race-middle: 75
+  race-high: 81

+llama-3-8b-turbomind:
+  gsm8k: 52
+  race-middle: 63
+  race-high: 70

+mistral-7b-v0.2-hf:
+  gsm8k: 43
+  race-middle: 42
+  race-high: 60

+qwen1.5-moe-a2.7b-hf:
+  gsm8k: 64
+  race-middle: 78
+  race-high: 90

+qwen2-0.5b-hf:
+  gsm8k: 35
+  race-middle: 52
+  race-high: 48

+qwen2-1.5b-turbomind:
+  gsm8k: 57
+  race-middle: 64
+  race-high: 78

+qwen2-7b-turbomind:
+  gsm8k: 83
+  race-middle: 88
+  race-high: 88

+yi-1.5-6b-hf:
+  gsm8k: 59
+  race-middle: 81
+  race-high: 89

+yi-1.5-9b-hf:
+  gsm8k: 77
+  race-middle: 90
+  race-high: 90
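The baseline file is plain YAML keyed by model abbreviation, one block per model. The `baseline_scores` fixture presumably loads it along these lines (a sketch using yaml.safe_load; the value checked is taken from the entries above):

    import yaml

    with open('.github/scripts/oc_score_baseline.yaml') as f:
        baseline_scores = yaml.safe_load(f)

    assert baseline_scores['qwen2-7b-instruct-turbomind']['gsm8k'] == 88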

.github/workflows/daily-run-test.yml (modified)

@@ -31,34 +31,46 @@ jobs:
           eval "$(conda shell.bash hook)"
           conda create -y --name ${{env.CONDA_ENV}} python=3.10
           conda activate ${{env.CONDA_ENV}}
-          pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
-          pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-*.whl
+          pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+          pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.0.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl --index-url https://download.pytorch.org/whl/cu118
+          pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+          pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}} --extra-index-url https://download.pytorch.org/whl/cu118
           conda info --envs
       - name: Prepare - Pip install code
         run: |
           eval "$(conda shell.bash hook)"
           conda activate ${{env.CONDA_ENV}}
           pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install human_eval transformers protobuf --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install human_eval transformers protobuf pytest --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
       - name: Prepare - prepare data and hf model
         run: |
           cp -r ${{env.USERSPACE_PREFIX}}/data .
           rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
           ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
-      - name: Run test
+      - name: Run chat model test
         run: |
           eval "$(conda shell.bash hook)"
           conda activate ${{env.CONDA_ENV}}
           conda info --envs
           rm -rf regression_result_daily
           export from_tf=TRUE
-          python3 run.py --models hf_internlm_chat_7b hf_internlm2_7b hf_chatglm3_6b_base hf_chatglm3_6b hf_qwen_7b_chat hf_qwen_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl obqa_ppl --work-dir regression_result_daily
-      - name: Get result
+          rm -rf /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary
+          python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }} --reuse
+          cp -r /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary regression_result_daily
+          python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name: Run base model test
         run: |
           eval "$(conda shell.bash hook)"
-          pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
-          python -m pytest -s -v --color=yes .github/scripts/oc_score_assert.py
           conda activate ${{env.CONDA_ENV}}
           conda info --envs
+          rm -rf regression_result_daily
+          export from_tf=TRUE
+          rm -rf /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary
+          python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }} --reuse
+          cp -r /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/*/summary regression_result_daily
+          python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
       - name: Remove Conda Env
         if: always()
         run: |

.github/workflows/pr-run-test.yml (modified)

@@ -31,7 +31,7 @@ jobs:
     environment: 'prod'
     timeout-minutes: 30
     steps:
-      - name: Clone repository
+      - name: Checkout repository
        uses: actions/checkout@v2
       - name: Prepare - Install opencompass
         run: |

configs/models/deepseek/vllm_deepseek_moe_16b_base.py (modified)

@@ -3,7 +3,7 @@ from opencompass.models import VLLM
 models = [
     dict(
         type=VLLM,
-        abbr='deepseek-moe-16b-base-hf',
+        abbr='deepseek-moe-16b-base-vllm',
        path='deepseek-ai/deepseek-moe-16b-base',
         model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.6),
         max_out_len=1024,
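The `abbr` rename matters because the test suite joins result rows against oc_score_baseline.yaml by model abbreviation (see `baseline_scores.get(model).get(dataset)` above); presumably the HF-backend config already uses 'deepseek-moe-16b-base-hf', so the vLLM variant needs its own key. A sketch of the assumed lookup:

    # abbr keys both the result row and the baseline entry (assumed join logic)
    baseline = {'deepseek-moe-16b-base-hf': {'gsm8k': 25}}
    abbr = 'deepseek-moe-16b-base-vllm'
    score = baseline.get(abbr)  # None: the vLLM config no longer shadows the HF entry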