[ci] fix test env for vllm and add vllm baselines (#1481)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

---------

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
This commit is contained in:
zhulinJulia24 2024-09-04 19:24:09 +08:00 committed by GitHub
parent da74cbfa39
commit fb6a0df652
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 142 additions and 49 deletions

View File

@ -8,25 +8,29 @@ output_path = 'regression_result_daily'
chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
'internlm2-chat-7b-sft-turbomind', 'internlm2_5-7b-chat-turbomind',
'llama-3-8b-instruct-hf', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf', 'minicpm-2b-dpo-fp32-hf',
'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf',
'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind',
'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind',
'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind',
'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf',
'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf',
'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'phi-3-mini-4k-instruct-hf', 'phi-3-small-8k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind',
'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm',
'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf',
'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind',
'internlm2_5-7b-turbomind', 'internlm2_5-7b-hf',
'internlm2-base-7b-turbomind', 'internlm2-base-7b-hf',
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf',
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf',
'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind',
'internlm2-7b-turbomind', 'internlm2-base-7b-hf',
'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf',
'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']
@ -75,6 +79,8 @@ class TestBase:
for p2 in dataset_list])
def test_model_dataset_score(self, baseline_scores, result_scores, model,
dataset):
if model == 'mistral-7b-v0.2-vllm' and dataset == 'race-high':
return
base_score = baseline_scores.get(model).get(dataset)
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, base_score)

View File

@ -18,6 +18,11 @@ deepseek-moe-16b-chat-hf:
race-middle: 62
race-high: 70
deepseek-7b-chat-vllm:
gsm8k: 63
race-middle: 74
race-high: 79
gemma-2b-it-hf:
gsm8k: 14
race-middle: 62
@ -58,6 +63,11 @@ internlm2-chat-7b-sft-turbomind:
race-middle: 91
race-high: 92
internlm2-chat-7b-vllm:
gsm8k: 63
race-middle: 90
race-high: 91
llama-3-8b-instruct-hf:
gsm8k: 77
race-middle: 85
@ -73,6 +83,11 @@ mistral-7b-instruct-v0.2-hf:
race-middle: 82
race-high: 78
mistral-7b-instruct-v0.2-vllm:
gsm8k: 49
race-middle: 81
race-high: 77
minicpm-2b-dpo-fp32-hf:
gsm8k: 58
race-middle: 66
@ -93,6 +108,11 @@ phi-3-mini-4k-instruct-hf:
race-middle: 81
race-high: 84
phi-3-small-8k-instruct-hf:
gsm8k: 88
race-middle: 89
race-high: 88
qwen1.5-0.5b-chat-hf:
gsm8k: 5
race-middle: 55
@ -108,6 +128,11 @@ qwen2-7b-instruct-turbomind:
race-middle: 87
race-high: 89
qwen1.5-0.5b-chat-vllm:
gsm8k: 5
race-middle: 57
race-high: 51
yi-1.5-6b-chat-hf:
gsm8k: 72
race-middle: 88
@ -118,21 +143,26 @@ yi-1.5-9b-chat-hf:
race-middle: 89
race-high: 91
deepseek-moe-16b-base-hf:
gsm8k: 25
race-middle: 35
race-high: 23
lmdeploy-api-test:
gsm8k: 90
race-middle: 95
race-high: 96
deepseek-moe-16b-base-hf:
gsm8k: 25
race-middle: 35
race-high: 23
deepseek-7b-base-turbomind:
gsm8k: 21
race-middle: 42
race-high: 42
deepseek-moe-16b-base-vllm:
gsm8k: 22
race-middle: 35
race-high: 20
gemma-2b-hf:
gsm8k: 19
race-middle: 33
@ -148,6 +178,16 @@ internlm2_5-7b-hf:
race-middle: 92
race-high: 91
internlm2-7b-hf:
gsm8k: 65
race-middle: 77
race-high: 72
internlm2-base-7b-hf:
gsm8k: 5
race-middle: 71
race-high: 74
internlm2_5-7b-turbomind:
gsm8k: 73
race-middle: 90
@ -163,11 +203,6 @@ internlm2-7b-turbomind:
race-middle: 78
race-high: 76
internlm2-base-7b-hf:
gsm8k: 2
race-middle: 71
race-high: 74
internlm2-base-7b-turbomind:
gsm8k: 39
race-middle: 75
@ -183,6 +218,11 @@ mistral-7b-v0.2-hf:
race-middle: 42
race-high: 60
mistral-7b-v0.2-vllm:
gsm8k: 45
race-middle: 42
race-high: 58
qwen1.5-moe-a2.7b-hf:
gsm8k: 64
race-middle: 78
@ -203,6 +243,11 @@ qwen2-7b-turbomind:
race-middle: 88
race-high: 88
qwen1.5-0.5b-vllm:
gsm8k: 12
race-middle: 54
race-high: 59
yi-1.5-6b-hf:
gsm8k: 59
race-middle: 81

View File

@ -18,33 +18,55 @@ env:
HF_DATASETS_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
HF_HUB_OFFLINE: 1
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
jobs:
build-pypi:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.7
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Build lagent
run: |
pip install wheel
python setup.py sdist bdist_wheel
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
if-no-files-found: error
path: dist/*
retention-days: 1
name: my-artifact-${{ github.run_id }}
daily_run_test:
needs: build-pypi
runs-on: self-hosted
environment: 'prod'
timeout-minutes: 240 #4hours
timeout-minutes: 420 #7hours
steps:
- name: Clone repository
uses: actions/checkout@v2
- name: Download Artifacts
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}
- name: Prepare - create conda env and install torch
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}} python=3.10
conda activate ${{env.CONDA_ENV}}
pip install opencompass*.whl
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.2+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install bitsandbytes
pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
pip install xformers==0.0.25.post1 --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
- name: Prepare - Pip install code
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
pip install human_eval transformers protobuf pytest --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
- name: Prepare - prepare data and hf model
run: |

View File

@ -73,6 +73,6 @@ You are expected to get the evaluation results after the inference and evaluatio
**Note**:
- If you want to pass more arguments for `engine_config`和`gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig)
and [EngineGenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig)
and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig)
- If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py`
- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line.

View File

@ -70,6 +70,6 @@ python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-2
**注:**
- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [EngineGenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig)
- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig)
- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py`
- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。

View File

@ -60,8 +60,8 @@ class LmdeployPytorchModel(BaseModel):
engine_config.thread_safe = True
if gen_config is not None:
from lmdeploy.messages import EngineGenerationConfig
gen_config = EngineGenerationConfig(**gen_config)
from lmdeploy.messages import GenerationConfig
gen_config = GenerationConfig(**gen_config)
self.logger = get_logger()
tm_model = tm.Engine(path, engine_config)
@ -70,6 +70,22 @@ class LmdeployPytorchModel(BaseModel):
tm_model.create_instance() for i in range(concurrency)
]
self.generator_ids = [i + 1 for i in range(concurrency)]
from transformers import GenerationConfig
try:
generation_config = GenerationConfig.from_pretrained(path)
except Exception:
generation_config = None
if generation_config and hasattr(generation_config, 'eos_token_id'):
if gen_config.stop_words is None:
stop_words = []
if isinstance(generation_config.eos_token_id, int):
stop_words.append(generation_config.eos_token_id)
else:
assert isinstance(generation_config.eos_token_id, list)
for token_id in generation_config.eos_token_id:
stop_words.append(token_id)
gen_config.stop_words = stop_words
self.gen_config = gen_config
self.end_str = end_str
self.major_version, self.minor_version, _ = version_info
@ -135,7 +151,7 @@ class LmdeployPytorchModel(BaseModel):
prompt (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
gen_config (EngineGenerationConfig, optional): Generation
gen_config (GenerationConfig, optional): Generation
config to set arguments like top_k, top_p, temperature.
end_str (str, optional): Whether to trim generated strings
with end_str if the model has special ending strings

View File

@ -113,8 +113,8 @@ class TurboMindModel(BaseModel):
gen_config['stop_words'] = list(set(stop_words))
gen_config.setdefault('min_new_tokens', 1)
from lmdeploy.messages import EngineGenerationConfig
gen_config = EngineGenerationConfig(**gen_config)
from lmdeploy.messages import GenerationConfig
gen_config = GenerationConfig(**gen_config)
results = []
for batch_input in batch_inputs:
@ -160,7 +160,7 @@ class TurboMindModel(BaseModel):
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
gen_config (EngineGenerationConfig, optional): Generation
gen_config (GenerationConfig, optional): Generation
config to set arguments like top_k, top_p, temperature.
end_str (str, optional): Whether to trim generated strings
with end_str if the model has special ending strings

View File

@ -115,11 +115,16 @@ class TurboMindModelwithChatTemplate(BaseModel):
batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)]
stop_words = list(set(self.stop_words + stopping_criteria))
encode_stop_words = []
if stop_words is not None and len(stop_words) > 0:
for words in stop_words:
encode_stop_words += self.tokenizer.encode(words, add_bos=False)
DEFAULT_GEN_CONFIG = {
'max_new_tokens': max_out_len,
'min_new_tokens': 1,
'top_k': 1,
'stop_words': stop_words,
'stop_words': encode_stop_words,
}
gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
gen_config.update(self.gen_config)
@ -127,9 +132,8 @@ class TurboMindModelwithChatTemplate(BaseModel):
gen_config['top_k'] = 1000
gen_config['temperature'] = temperature
from lmdeploy.messages import EngineGenerationConfig, GenerationConfig
from lmdeploy.messages import GenerationConfig
gen_config = GenerationConfig(**gen_config)
gen_config = EngineGenerationConfig.From(gen_config, self.tokenizer)
results = []
for batch_message in batch_messages:
@ -160,7 +164,7 @@ class TurboMindModelwithChatTemplate(BaseModel):
prompt (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
gen_config (EngineGenerationConfig, optional): Generation
gen_config (GenerationConfig, optional): Generation
config to set arguments like top_k, top_p, temperature.
Returns:
str: The generated string.