From fb6a0df652ebc76b1a2c61f3f4a3ec8fd865dadf Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 4 Sep 2024 19:24:09 +0800 Subject: [PATCH] [ci] fix test env for vllm and add vllm baselines (#1481) * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update --------- Co-authored-by: zhulin1 --- .github/scripts/oc_score_assert.py | 34 ++++++---- .github/scripts/oc_score_baseline.yaml | 65 ++++++++++++++++--- .github/workflows/daily-run-test.yml | 48 ++++++++++---- .../advanced_guides/evaluation_turbomind.md | 2 +- .../advanced_guides/evaluation_turbomind.md | 2 +- opencompass/models/lmdeploy_pytorch.py | 22 ++++++- opencompass/models/turbomind.py | 6 +- .../models/turbomind_with_tf_above_v4_33.py | 12 ++-- 8 files changed, 142 insertions(+), 49 deletions(-) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index c454b772..1397db88 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -8,25 +8,29 @@ output_path = 'regression_result_daily' chat_model_list = [ 'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', - 'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2_5-7b-chat-hf', - 'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', - 'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind', - 'internlm2-chat-7b-sft-turbomind', 'internlm2_5-7b-chat-turbomind', - 'llama-3-8b-instruct-hf', 'llama-3-8b-instruct-turbomind', - 'mistral-7b-instruct-v0.2-hf', 'minicpm-2b-dpo-fp32-hf', + 'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf', + 'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind', + 'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind', + 'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind', + 'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf', + 'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf', + 'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf', - 'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf', - 'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind', + 'phi-3-mini-4k-instruct-hf', 'phi-3-small-8k-instruct-hf', + 'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind', + 'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test' ] base_model_list = [ - 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf', - 'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind', - 'internlm2_5-7b-turbomind', 'internlm2_5-7b-hf', - 'internlm2-base-7b-turbomind', 'internlm2-base-7b-hf', - 'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf', + 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', + 'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf', + 'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf', + 'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind', + 'internlm2-7b-turbomind', 'internlm2-base-7b-hf', + 'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind', + 'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', - 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' + 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' ] dataset_list = ['gsm8k', 'race-middle', 'race-high'] @@ -75,6 +79,8 @@ class TestBase: for p2 in dataset_list]) def test_model_dataset_score(self, baseline_scores, result_scores, model, dataset): + if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high': + return base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) assert_score(result_score, base_score) diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index 8ec8a5f7..d7e765be 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -18,6 +18,11 @@ deepseek-moe-16b-chat-hf: race-middle: 62 race-high: 70 +deepseek-7b-chat-vllm: + gsm8k: 63 + race-middle: 74 + race-high: 79 + gemma-2b-it-hf: gsm8k: 14 race-middle: 62 @@ -58,6 +63,11 @@ internlm2-chat-7b-sft-turbomind: race-middle: 91 race-high: 92 +internlm2-chat-7b-vllm: + gsm8k: 63 + race-middle: 90 + race-high: 91 + llama-3-8b-instruct-hf: gsm8k: 77 race-middle: 85 @@ -73,6 +83,11 @@ mistral-7b-instruct-v0.2-hf: race-middle: 82 race-high: 78 +mistral-7b-instruct-v0.2-vllm: + gsm8k: 49 + race-middle: 81 + race-high: 77 + minicpm-2b-dpo-fp32-hf: gsm8k: 58 race-middle: 66 @@ -93,6 +108,11 @@ phi-3-mini-4k-instruct-hf: race-middle: 81 race-high: 84 +phi-3-small-8k-instruct-hf: + gsm8k: 88 + race-middle: 89 + race-high: 88 + qwen1.5-0.5b-chat-hf: gsm8k: 5 race-middle: 55 @@ -108,6 +128,11 @@ qwen2-7b-instruct-turbomind: race-middle: 87 race-high: 89 +qwen1.5-0.5b-chat-vllm: + gsm8k: 5 + race-middle: 57 + race-high: 51 + yi-1.5-6b-chat-hf: gsm8k: 72 race-middle: 88 @@ -118,21 +143,26 @@ yi-1.5-9b-chat-hf: race-middle: 89 race-high: 91 -deepseek-moe-16b-base-hf: - gsm8k: 25 - race-middle: 35 - race-high: 23 - lmdeploy-api-test: gsm8k: 90 race-middle: 95 race-high: 96 +deepseek-moe-16b-base-hf: + gsm8k: 25 + race-middle: 35 + race-high: 23 + deepseek-7b-base-turbomind: gsm8k: 21 race-middle: 42 race-high: 42 +deepseek-moe-16b-base-vllm: + gsm8k: 22 + race-middle: 35 + race-high: 20 + gemma-2b-hf: gsm8k: 19 race-middle: 33 @@ -148,6 +178,16 @@ internlm2_5-7b-hf: race-middle: 92 race-high: 91 +internlm2-7b-hf: + gsm8k: 65 + race-middle: 77 + race-high: 72 + +internlm2-base-7b-hf: + gsm8k: 5 + race-middle: 71 + race-high: 74 + internlm2_5-7b-turbomind: gsm8k: 73 race-middle: 90 @@ -163,11 +203,6 @@ internlm2-7b-turbomind: race-middle: 78 race-high: 76 -internlm2-base-7b-hf: - gsm8k: 2 - race-middle: 71 - race-high: 74 - internlm2-base-7b-turbomind: gsm8k: 39 race-middle: 75 @@ -183,6 +218,11 @@ mistral-7b-v0.2-hf: race-middle: 42 race-high: 60 +mistral-7b-v0.2-vllm: + gsm8k: 45 + race-middle: 42 + race-high: 58 + qwen1.5-moe-a2.7b-hf: gsm8k: 64 race-middle: 78 @@ -203,6 +243,11 @@ qwen2-7b-turbomind: race-middle: 88 race-high: 88 +qwen1.5-0.5b-vllm: + gsm8k: 12 + race-middle: 54 + race-high: 59 + yi-1.5-6b-hf: gsm8k: 59 race-middle: 81 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index ebe66523..189d6e04 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -18,33 +18,55 @@ env: HF_DATASETS_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1 HF_HUB_OFFLINE: 1 + TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas jobs: + build-pypi: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 + - name: Build lagent + run: | + pip install wheel + python setup.py sdist bdist_wheel + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: dist/* + retention-days: 1 + name: my-artifact-${{ github.run_id }} + daily_run_test: + needs: build-pypi runs-on: self-hosted environment: 'prod' - timeout-minutes: 240 #4hours + timeout-minutes: 420 #7hours steps: - name: Clone repository uses: actions/checkout@v2 + - name: Download Artifacts + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }} - name: Prepare - create conda env and install torch run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate conda create -y --name ${{env.CONDA_ENV}} python=3.10 conda activate ${{env.CONDA_ENV}} + pip install opencompass*.whl pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.2+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - pip install bitsandbytes - pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 - pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}} - conda info --envs - - name: Prepare - Pip install code - run: | - . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} - pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}} - pip install human_eval transformers protobuf pytest --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + + pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}} + pip uninstall torch torchvision torchaudio -y + pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs - name: Prepare - prepare data and hf model run: | diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md index 0f08ffec..c1299f0b 100644 --- a/docs/en/advanced_guides/evaluation_turbomind.md +++ b/docs/en/advanced_guides/evaluation_turbomind.md @@ -73,6 +73,6 @@ You are expected to get the evaluation results after the inference and evaluatio **Note**: - If you want to pass more arguments for `engine_config`和`gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig) - and [EngineGenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig) + and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig) - If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py` - If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line. diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md index 3d1414d8..a7c37b75 100644 --- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md +++ b/docs/zh_cn/advanced_guides/evaluation_turbomind.md @@ -70,6 +70,6 @@ python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-2 **注:** -- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [EngineGenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig) +- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig) - 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py` - 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。 diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py index f9d67da4..92118066 100644 --- a/opencompass/models/lmdeploy_pytorch.py +++ b/opencompass/models/lmdeploy_pytorch.py @@ -60,8 +60,8 @@ class LmdeployPytorchModel(BaseModel): engine_config.thread_safe = True if gen_config is not None: - from lmdeploy.messages import EngineGenerationConfig - gen_config = EngineGenerationConfig(**gen_config) + from lmdeploy.messages import GenerationConfig + gen_config = GenerationConfig(**gen_config) self.logger = get_logger() tm_model = tm.Engine(path, engine_config) @@ -70,6 +70,22 @@ class LmdeployPytorchModel(BaseModel): tm_model.create_instance() for i in range(concurrency) ] self.generator_ids = [i + 1 for i in range(concurrency)] + + from transformers import GenerationConfig + try: + generation_config = GenerationConfig.from_pretrained(path) + except Exception: + generation_config = None + if generation_config and hasattr(generation_config, 'eos_token_id'): + if gen_config.stop_words is None: + stop_words = [] + if isinstance(generation_config.eos_token_id, int): + stop_words.append(generation_config.eos_token_id) + else: + assert isinstance(generation_config.eos_token_id, list) + for token_id in generation_config.eos_token_id: + stop_words.append(token_id) + gen_config.stop_words = stop_words self.gen_config = gen_config self.end_str = end_str self.major_version, self.minor_version, _ = version_info @@ -135,7 +151,7 @@ class LmdeployPytorchModel(BaseModel): prompt (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. - gen_config (EngineGenerationConfig, optional): Generation + gen_config (GenerationConfig, optional): Generation config to set arguments like top_k, top_p, temperature. end_str (str, optional): Whether to trim generated strings with end_str if the model has special ending strings diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py index af769c28..e6cfebd2 100644 --- a/opencompass/models/turbomind.py +++ b/opencompass/models/turbomind.py @@ -113,8 +113,8 @@ class TurboMindModel(BaseModel): gen_config['stop_words'] = list(set(stop_words)) gen_config.setdefault('min_new_tokens', 1) - from lmdeploy.messages import EngineGenerationConfig - gen_config = EngineGenerationConfig(**gen_config) + from lmdeploy.messages import GenerationConfig + gen_config = GenerationConfig(**gen_config) results = [] for batch_input in batch_inputs: @@ -160,7 +160,7 @@ class TurboMindModel(BaseModel): The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. - gen_config (EngineGenerationConfig, optional): Generation + gen_config (GenerationConfig, optional): Generation config to set arguments like top_k, top_p, temperature. end_str (str, optional): Whether to trim generated strings with end_str if the model has special ending strings diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index 5ea020a5..ccda31f6 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -115,11 +115,16 @@ class TurboMindModelwithChatTemplate(BaseModel): batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)] stop_words = list(set(self.stop_words + stopping_criteria)) + encode_stop_words = [] + if stop_words is not None and len(stop_words) > 0: + for words in stop_words: + encode_stop_words += self.tokenizer.encode(words, add_bos=False) + DEFAULT_GEN_CONFIG = { 'max_new_tokens': max_out_len, 'min_new_tokens': 1, 'top_k': 1, - 'stop_words': stop_words, + 'stop_words': encode_stop_words, } gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG) gen_config.update(self.gen_config) @@ -127,9 +132,8 @@ class TurboMindModelwithChatTemplate(BaseModel): gen_config['top_k'] = 1000 gen_config['temperature'] = temperature - from lmdeploy.messages import EngineGenerationConfig, GenerationConfig + from lmdeploy.messages import GenerationConfig gen_config = GenerationConfig(**gen_config) - gen_config = EngineGenerationConfig.From(gen_config, self.tokenizer) results = [] for batch_message in batch_messages: @@ -160,7 +164,7 @@ class TurboMindModelwithChatTemplate(BaseModel): prompt (PromptType): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. - gen_config (EngineGenerationConfig, optional): Generation + gen_config (GenerationConfig, optional): Generation config to set arguments like top_k, top_p, temperature. Returns: str: The generated string.