From fb6a0df652ebc76b1a2c61f3f4a3ec8fd865dadf Mon Sep 17 00:00:00 2001
From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Date: Wed, 4 Sep 2024 19:24:09 +0800
Subject: [PATCH] [ci] fix test env for vllm and add vllm baselines (#1481)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
---
 .github/scripts/oc_score_assert.py            | 34 ++++++----
 .github/scripts/oc_score_baseline.yaml        | 65 ++++++++++++++++---
 .github/workflows/daily-run-test.yml          | 48 ++++++++++----
 .../advanced_guides/evaluation_turbomind.md   |  2 +-
 .../advanced_guides/evaluation_turbomind.md   |  2 +-
 opencompass/models/lmdeploy_pytorch.py        | 22 ++++++-
 opencompass/models/turbomind.py               |  6 +-
 .../models/turbomind_with_tf_above_v4_33.py   | 12 ++--
 8 files changed, 142 insertions(+), 49 deletions(-)

diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py
index c454b772..1397db88 100644
--- a/.github/scripts/oc_score_assert.py
+++ b/.github/scripts/oc_score_assert.py
@@ -8,25 +8,29 @@ output_path = 'regression_result_daily'
 
 chat_model_list = [
     'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
-    'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2_5-7b-chat-hf',
-    'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
-    'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
-    'internlm2-chat-7b-sft-turbomind', 'internlm2_5-7b-chat-turbomind',
-    'llama-3-8b-instruct-hf', 'llama-3-8b-instruct-turbomind',
-    'mistral-7b-instruct-v0.2-hf', 'minicpm-2b-dpo-fp32-hf',
+    'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf',
+    'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind',
+    'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind',
+    'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind',
+    'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf',
+    'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
+    'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf',
     'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
-    'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
-    'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
+    'phi-3-mini-4k-instruct-hf', 'phi-3-small-8k-instruct-hf',
+    'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind',
+    'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm',
     'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test'
 ]
 base_model_list = [
-    'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf',
-    'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind',
-    'internlm2_5-7b-turbomind', 'internlm2_5-7b-hf',
-    'internlm2-base-7b-turbomind', 'internlm2-base-7b-hf',
-    'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf',
+    'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
+    'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf',
+    'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
+    'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind',
+    'internlm2-7b-turbomind', 'internlm2-base-7b-hf',
+    'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
+    'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf',
     'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
-    'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
+    'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
 ]
 dataset_list = ['gsm8k', 'race-middle', 'race-high']
 
@@ -75,6 +79,8 @@ class TestBase:
                                                 for p2 in dataset_list])
     def test_model_dataset_score(self, baseline_scores, result_scores, model,
                                  dataset):
+        if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high':
+            return
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
         assert_score(result_score, base_score)
diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml
index 8ec8a5f7..d7e765be 100644
--- a/.github/scripts/oc_score_baseline.yaml
+++ b/.github/scripts/oc_score_baseline.yaml
@@ -18,6 +18,11 @@ deepseek-moe-16b-chat-hf:
     race-middle: 62
     race-high: 70
 
+deepseek-7b-chat-vllm:
+    gsm8k: 63
+    race-middle: 74
+    race-high: 79
+
 gemma-2b-it-hf:
     gsm8k: 14
     race-middle: 62
@@ -58,6 +63,11 @@ internlm2-chat-7b-sft-turbomind:
     race-middle: 91
     race-high: 92
 
+internlm2-chat-7b-vllm:
+    gsm8k: 63
+    race-middle: 90
+    race-high: 91
+
 llama-3-8b-instruct-hf:
     gsm8k: 77
     race-middle: 85
@@ -73,6 +83,11 @@ mistral-7b-instruct-v0.2-hf:
     race-middle: 82
     race-high: 78
 
+mistral-7b-instruct-v0.2-vllm:
+    gsm8k: 49
+    race-middle: 81
+    race-high: 77
+
 minicpm-2b-dpo-fp32-hf:
     gsm8k: 58
     race-middle: 66
@@ -93,6 +108,11 @@ phi-3-mini-4k-instruct-hf:
     race-middle: 81
     race-high: 84
 
+phi-3-small-8k-instruct-hf:
+    gsm8k: 88
+    race-middle: 89
+    race-high: 88
+
 qwen1.5-0.5b-chat-hf:
     gsm8k: 5
     race-middle: 55
@@ -108,6 +128,11 @@ qwen2-7b-instruct-turbomind:
     race-middle: 87
     race-high: 89
 
+qwen1.5-0.5b-chat-vllm:
+    gsm8k: 5
+    race-middle: 57
+    race-high: 51
+
 yi-1.5-6b-chat-hf:
     gsm8k: 72
     race-middle: 88
@@ -118,21 +143,26 @@ yi-1.5-9b-chat-hf:
     race-middle: 89
     race-high: 91
 
-deepseek-moe-16b-base-hf:
-    gsm8k: 25
-    race-middle: 35
-    race-high: 23
-
 lmdeploy-api-test:
     gsm8k: 90
     race-middle: 95
     race-high: 96
 
+deepseek-moe-16b-base-hf:
+    gsm8k: 25
+    race-middle: 35
+    race-high: 23
+
 deepseek-7b-base-turbomind:
     gsm8k: 21
     race-middle: 42
     race-high: 42
 
+deepseek-moe-16b-base-vllm:
+    gsm8k: 22
+    race-middle: 35
+    race-high: 20
+
 gemma-2b-hf:
     gsm8k: 19
     race-middle: 33
@@ -148,6 +178,16 @@ internlm2_5-7b-hf:
     race-middle: 92
     race-high: 91
 
+internlm2-7b-hf:
+    gsm8k: 65
+    race-middle: 77
+    race-high: 72
+
+internlm2-base-7b-hf:
+    gsm8k: 5
+    race-middle: 71
+    race-high: 74
+
 internlm2_5-7b-turbomind:
     gsm8k: 73
     race-middle: 90
@@ -163,11 +203,6 @@ internlm2-7b-turbomind:
     race-middle: 78
     race-high: 76
 
-internlm2-base-7b-hf:
-    gsm8k: 2
-    race-middle: 71
-    race-high: 74
-
 internlm2-base-7b-turbomind:
     gsm8k: 39
     race-middle: 75
@@ -183,6 +218,11 @@ mistral-7b-v0.2-hf:
     race-middle: 42
     race-high: 60
 
+mistral-7b-v0.2-vllm:
+    gsm8k: 45
+    race-middle: 42
+    race-high: 58
+
 qwen1.5-moe-a2.7b-hf:
     gsm8k: 64
     race-middle: 78
@@ -203,6 +243,11 @@ qwen2-7b-turbomind:
     race-middle: 88
     race-high: 88
 
+qwen1.5-0.5b-vllm:
+    gsm8k: 12
+    race-middle: 54
+    race-high: 59
+
 yi-1.5-6b-hf:
     gsm8k: 59
     race-middle: 81
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index ebe66523..189d6e04 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -18,33 +18,55 @@ env:
   HF_DATASETS_OFFLINE: 1
   TRANSFORMERS_OFFLINE: 1
   HF_HUB_OFFLINE: 1
+  TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
 
 jobs:
+  build-pypi:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.7
+      - name: Build lagent
+        run: |
+          pip install wheel
+          python setup.py sdist bdist_wheel
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: dist/*
+          retention-days: 1
+          name: my-artifact-${{ github.run_id }}
+
   daily_run_test:
+    needs: build-pypi
     runs-on: self-hosted
     environment: 'prod'
-    timeout-minutes: 240 #4hours
+    timeout-minutes: 420 #7hours
     steps:
       - name: Clone repository
         uses: actions/checkout@v2
+      - name: Download Artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: my-artifact-${{ github.run_id }}
       - name: Prepare - create conda env and install torch
         run: |
           . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
           conda create -y --name ${{env.CONDA_ENV}} python=3.10
           conda activate ${{env.CONDA_ENV}}
+          pip install opencompass*.whl
           pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.2+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
-          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-          pip install bitsandbytes
-          pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
-          pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}}
-          conda info --envs
-      - name: Prepare - Pip install code
-        run: |
-          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
-          conda activate ${{env.CONDA_ENV}}
-          pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install human_eval transformers protobuf pytest --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+
+          pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip uninstall torch torchvision torchaudio -y
+          pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+          pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
       - name: Prepare - prepare data and hf model
         run: |
diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md
index 0f08ffec..c1299f0b 100644
--- a/docs/en/advanced_guides/evaluation_turbomind.md
+++ b/docs/en/advanced_guides/evaluation_turbomind.md
@@ -73,6 +73,6 @@ You are expected to get the evaluation results after the inference and evaluatio
 **Note**:
 
 - If you want to pass more arguments for `engine_config`和`gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig)
-  and [EngineGenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig)
+  and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig)
 - If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py`
 - If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line.
diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
index 3d1414d8..a7c37b75 100644
--- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md
+++ b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
@@ -70,6 +70,6 @@ python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-2
 
 **注：**
 
-- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数，请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [EngineGenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig)
+- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数，请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig)
 - 如果评测 InternLM Chat 模型，请使用配置文件 `eval_internlm_chat_turbomind.py`
 - 如果评测 InternLM 7B 模型，请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。
diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py
index f9d67da4..92118066 100644
--- a/opencompass/models/lmdeploy_pytorch.py
+++ b/opencompass/models/lmdeploy_pytorch.py
@@ -60,8 +60,8 @@ class LmdeployPytorchModel(BaseModel):
                 engine_config.thread_safe = True
 
         if gen_config is not None:
-            from lmdeploy.messages import EngineGenerationConfig
-            gen_config = EngineGenerationConfig(**gen_config)
+            from lmdeploy.messages import GenerationConfig
+            gen_config = GenerationConfig(**gen_config)
 
         self.logger = get_logger()
         tm_model = tm.Engine(path, engine_config)
@@ -70,6 +70,22 @@ class LmdeployPytorchModel(BaseModel):
             tm_model.create_instance() for i in range(concurrency)
         ]
         self.generator_ids = [i + 1 for i in range(concurrency)]
+
+        from transformers import GenerationConfig
+        try:
+            generation_config = GenerationConfig.from_pretrained(path)
+        except Exception:
+            generation_config = None
+        if generation_config and hasattr(generation_config, 'eos_token_id'):
+            if gen_config.stop_words is None:
+                stop_words = []
+            if isinstance(generation_config.eos_token_id, int):
+                stop_words.append(generation_config.eos_token_id)
+            else:
+                assert isinstance(generation_config.eos_token_id, list)
+                for token_id in generation_config.eos_token_id:
+                    stop_words.append(token_id)
+            gen_config.stop_words = stop_words
         self.gen_config = gen_config
         self.end_str = end_str
         self.major_version, self.minor_version, _ = version_info
@@ -135,7 +151,7 @@ class LmdeployPytorchModel(BaseModel):
             prompt (PromptType): A string or PromptDict.
                 The PromptDict should be organized in OpenCompass'
                 API format.
-            gen_config (EngineGenerationConfig, optional): Generation
+            gen_config (GenerationConfig, optional): Generation
                 config to set arguments like top_k, top_p, temperature.
             end_str (str, optional): Whether to trim generated strings
                 with end_str if the model has special ending strings
diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py
index af769c28..e6cfebd2 100644
--- a/opencompass/models/turbomind.py
+++ b/opencompass/models/turbomind.py
@@ -113,8 +113,8 @@ class TurboMindModel(BaseModel):
             gen_config['stop_words'] = list(set(stop_words))
         gen_config.setdefault('min_new_tokens', 1)
 
-        from lmdeploy.messages import EngineGenerationConfig
-        gen_config = EngineGenerationConfig(**gen_config)
+        from lmdeploy.messages import GenerationConfig
+        gen_config = GenerationConfig(**gen_config)
 
         results = []
         for batch_input in batch_inputs:
@@ -160,7 +160,7 @@ class TurboMindModel(BaseModel):
                 The PromptDict should be organized in OpenCompass'
                 API format.
             max_out_len (int): The maximum length of the output.
-            gen_config (EngineGenerationConfig, optional): Generation
+            gen_config (GenerationConfig, optional): Generation
                 config to set arguments like top_k, top_p, temperature.
             end_str (str, optional): Whether to trim generated strings
                 with end_str if the model has special ending strings
diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py
index 5ea020a5..ccda31f6 100644
--- a/opencompass/models/turbomind_with_tf_above_v4_33.py
+++ b/opencompass/models/turbomind_with_tf_above_v4_33.py
@@ -115,11 +115,16 @@ class TurboMindModelwithChatTemplate(BaseModel):
         batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)]
 
         stop_words = list(set(self.stop_words + stopping_criteria))
+        encode_stop_words = []
+        if stop_words is not None and len(stop_words) > 0:
+            for words in stop_words:
+                encode_stop_words += self.tokenizer.encode(words, add_bos=False)
+
         DEFAULT_GEN_CONFIG = {
             'max_new_tokens': max_out_len,
             'min_new_tokens': 1,
             'top_k': 1,
-            'stop_words': stop_words,
+            'stop_words': encode_stop_words,
         }
         gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
         gen_config.update(self.gen_config)
@@ -127,9 +132,8 @@ class TurboMindModelwithChatTemplate(BaseModel):
             gen_config['top_k'] = 1000
             gen_config['temperature'] = temperature
 
-        from lmdeploy.messages import EngineGenerationConfig, GenerationConfig
+        from lmdeploy.messages import GenerationConfig
         gen_config = GenerationConfig(**gen_config)
-        gen_config = EngineGenerationConfig.From(gen_config, self.tokenizer)
 
         results = []
         for batch_message in batch_messages:
@@ -160,7 +164,7 @@ class TurboMindModelwithChatTemplate(BaseModel):
             prompt (PromptType): A string or PromptDict.
                 The PromptDict should be organized in OpenCompass'
                 API format.
-            gen_config (EngineGenerationConfig, optional): Generation
+            gen_config (GenerationConfig, optional): Generation
                 config to set arguments like top_k, top_p, temperature.
         Returns:
             str: The generated string.