[Feature] Update evaluate turbomind (#804)

* update

* fix

* fix

* fix
RunningLeon 2024-01-17 11:09:50 +08:00 committed by GitHub
parent 814b3f73bd
commit 0836aec67b
5 changed files with 58 additions and 54 deletions

View File

@@ -48,7 +48,7 @@ baichuan2_meta_template = dict(round=[
 internlm_chat_7b = dict(
     type=TurboMindModel,
     abbr='internlm-chat-7b-turbomind',
-    path='./turbomind',
+    path='internlm/internlm-chat-7b',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -60,7 +60,7 @@ internlm_chat_7b = dict(
 internlm_chat_7b_w4 = dict(
     type=TurboMindModel,
     abbr='internlm-chat-7b-w4-turbomind',
-    path='./turbomind',
+    path='internlm/internlm-chat-7b-w4',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -73,7 +73,7 @@ internlm_chat_7b_w4 = dict(
 internlm_chat_7b_w4kv8 = dict(
     type=TurboMindModel,
     abbr='internlm-chat-7b-w4kv8-turbomind',
-    path='./turbomind',
+    path='internlm/internlm-chat-7b-w4kv8',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -86,7 +86,7 @@ internlm_chat_7b_w4kv8 = dict(
 internlm_chat_20b = dict(
     type=TurboMindModel,
     abbr='internlm-chat-20b-turbomind',
-    path='./turbomind',
+    path='internlm/internlm-chat-20b',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=8,
@@ -99,7 +99,7 @@ internlm_chat_20b = dict(
 internlm_chat_20b_w4 = dict(
     type=TurboMindModel,
     abbr='internlm-chat-20b-w4-turbomind',
-    path='./turbomind',
+    path='internlm/internlm-chat-20b-w4',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
@@ -112,7 +112,7 @@ internlm_chat_20b_w4 = dict(
 internlm_chat_20b_w4kv8 = dict(
     type=TurboMindModel,
     abbr='internlm-chat-20b-w4kv8-turbomind',
-    path='./turbomind',
+    path='internlm/internlm-chat-20b-w4kv8',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
@@ -125,7 +125,7 @@ internlm_chat_20b_w4kv8 = dict(
 llama2_chat_7b = dict(
     type=TurboMindModel,
     abbr='llama2-chat-7b-turbomind',
-    path='./turbomind',
+    path='meta-llama/Llama-2-7b-chat-hf',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
@@ -138,7 +138,7 @@ llama2_chat_7b = dict(
 llama2_chat_13b = dict(
     type=TurboMindModel,
     abbr='llama2-chat-13b-turbomind',
-    path='./turbomind',
+    path='meta-llama/Llama-2-13b-chat-hf',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
@@ -151,7 +151,7 @@ llama2_chat_13b = dict(
 llama2_chat_70b = dict(
     type=TurboMindModel,
     abbr='llama2-chat-70b-turbomind',
-    path='./turbomind',
+    path='meta-llama/Llama-2-70b-chat-hf',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=8,
@@ -164,7 +164,7 @@ llama2_chat_70b = dict(
 qwen_chat_7b = dict(
     type=TurboMindModel,
     abbr='qwen-chat-7b-turbomind',
-    path='./turbomind',
+    path='Qwen/Qwen-7B-Chat',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
@@ -177,7 +177,7 @@ qwen_chat_7b = dict(
 qwen_chat_14b = dict(
     type=TurboMindModel,
     abbr='qwen-chat-14b-turbomind',
-    path='./turbomind',
+    path='Qwen/Qwen-14B-Chat',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
@@ -190,7 +190,7 @@ qwen_chat_14b = dict(
 baichuan2_chat_7b = dict(
     type=TurboMindModel,
     abbr='baichuan2-chat-7b-turbomind',
-    path='./turbomind',
+    path='baichuan-inc/Baichuan2-7B-Chat',
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
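
Across all of these entries the only change is `path`: instead of pointing at a pre-converted local `./turbomind` directory, it now names a HuggingFace model ID, which the updated `TurboMindModel` loads directly via `TurboMind.from_pretrained` (see the last file in this commit). A minimal sketch of what one entry looks like after the change; the import line and the `concurrency`/`run_cfg` fields are assumptions about the surrounding config, not taken from this diff:

```python
# Sketch of a post-change model entry. type/abbr/path/max_out_len/max_seq_len/
# batch_size come from the diff above; the import and the concurrency/run_cfg
# fields are assumed and may differ in the real config file.
from opencompass.models.turbomind import TurboMindModel  # assumed import path

internlm_chat_7b = dict(
    type=TurboMindModel,
    abbr='internlm-chat-7b-turbomind',
    path='internlm/internlm-chat-7b',   # HF model ID instead of './turbomind'
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,                          # assumed field
    run_cfg=dict(num_gpus=1, num_procs=1),   # assumed field
)
```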

View File

@@ -19,7 +19,7 @@ datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
 internlm_7b = dict(
     type=TurboMindModel,
     abbr='internlm-7b-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-7b",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -31,7 +31,7 @@ internlm_7b = dict(
 internlm_7b_w4 = dict(
     type=TurboMindModel,
     abbr='internlm-7b-w4-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-7b-w4",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -43,7 +43,7 @@ internlm_7b_w4 = dict(
 internlm_7b_w4kv8 = dict(
     type=TurboMindModel,
     abbr='internlm-7b-w4kv8-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-7b-w4kv8",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -55,7 +55,7 @@ internlm_7b_w4kv8 = dict(
 internlm_20b = dict(
     type=TurboMindModel,
     abbr='internlm-20b-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-20b",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=8,
@@ -67,7 +67,7 @@ internlm_20b = dict(
 internlm_20b_w4 = dict(
     type=TurboMindModel,
     abbr='internlm-20b-w4-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-20b-w4",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
@@ -80,7 +80,7 @@ internlm_20b_w4 = dict(
 internlm_20b_w4kv8 = dict(
     type=TurboMindModel,
     abbr='internlm-20b-w4kv8-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-20b-w4kv8",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
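
The base-model config gets the identical `path` treatment. For orientation, a hedged sketch of how such a config is typically assembled: the `datasets` line is quoted from the hunk header above, while the `models` list is an assumption about the rest of the file, not something shown in this diff.

```python
# Collect every *_datasets variable brought in by earlier imports
# (this exact line appears in the hunk header above).
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# Assumed: the config ends by selecting which model dicts to evaluate.
models = [internlm_7b, internlm_20b]
```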

View File

@@ -20,27 +20,14 @@ pip install lmdeploy
 OpenCompass integrates both turbomind's python API and gRPC API for evaluation. And the former is highly recommended.
-We take the InternLM-20B as example. Please download it from huggingface and convert it to turbomind's model format:
+We take the InternLM-20B as example. Please download it from huggingface:
 ```shell
-# 1. Download InternLM model(or use the cached model's checkpoint)
+# Download InternLM model(or use the cached model's checkpoint)
 # Make sure you have git-lfs installed (https://git-lfs.com)
 git lfs install
 git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b
-# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
-lmdeploy convert internlm /path/to/internlm-20b \
-  --dst-path {/home/folder/of/opencompass}/turbomind
-```
-**Note**:
-If evaluating the InternLM Chat model, make sure to pass `internlm-chat` as the model name instead of `internlm` when converting the model format. The specific command is:
-```shell
-lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
-  --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 ### Evaluation with Turbomind Python API (recommended)
@@ -61,6 +48,22 @@ You are expected to get the evaluation results after the inference and evaluation
 ### Evaluation with Turbomind gPRC API (optional)
+Convert model to TurboMind format using lmdeploy
+```shell
+lmdeploy convert internlm /path/to/internlm-20b \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+**Note**:
+If evaluating the InternLM Chat model, make sure to pass `internlm-chat` as the model name instead of `internlm` when converting the model format. The specific command is:
+```shell
+lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
 In the home folder of OpenCompass, launch the Triton Inference Server:
 ```shell
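
The conversion step can move out of the recommended path because lmdeploy's TurboMind Python API consumes a HuggingFace checkpoint (or a local clone such as `/path/to/internlm-20b`) directly and converts it on the fly. A rough sketch of that load path, using the `TurboMind.from_pretrained` / `create_instance` calls introduced in the last file of this commit; treat it as illustrative rather than as the documented API:

```python
# Sketch only: why no explicit `lmdeploy convert` is needed for the Python API
# route; the conversion happens inside from_pretrained.
from lmdeploy.turbomind import TurboMind

tm_model = TurboMind.from_pretrained('/path/to/internlm-20b')  # local clone or HF ID
tokenizer = tm_model.tokenizer           # tokenizer ships with the engine
generator = tm_model.create_instance()   # one inference session
```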

View File

@@ -20,25 +20,14 @@ pip install lmdeploy
 OpenCompass supports evaluating datasets through both the turbomind python API and the gRPC API. We strongly recommend using the former.
-The following takes the InternLM-20B model as an example to show how to evaluate. First, download the InternLM model from huggingface and convert it to turbomind model format
+The following takes the InternLM-20B model as an example to show how to evaluate. First, download the InternLM model from huggingface:
 ```shell
-# 1. Download InternLM model(or use the cached model's checkpoint)
+# Download InternLM model(or use the cached model's checkpoint)
 # Make sure you have git-lfs installed (https://git-lfs.com)
 git lfs install
 git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b
-# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
-lmdeploy convert internlm /path/to/internlm-20b \
-  --dst-path {/home/folder/of/opencompass}/turbomind
-```
-Note: if evaluating an InternLM Chat model, the model name must be `internlm-chat` when converting the model format. The specific command is:
-```shell
-lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
-  --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 ### Evaluation with TurboMind Python API (recommended)
@@ -57,6 +46,20 @@ python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-2
 ### Evaluation with TurboMind gPRC API (optional)
+First, the model needs to be converted to turbomind format
+```shell script
+lmdeploy convert internlm /path/to/internlm-20b \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+Note: if evaluating an InternLM Chat model, the model name must be `internlm-chat` when converting the model format. The specific command is:
+```shell
+lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
 In the project directory of OpenCompass, launch the triton inference server
 ```shell

View File

@@ -1,4 +1,3 @@
-import os.path as osp
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, List, Optional, Union
@@ -40,16 +39,14 @@ class TurboMindModel(BaseModel):
         max_seq_len: int = 2048,
         meta_template: Optional[Dict] = None,
     ):
-        from lmdeploy import turbomind as tm
-        from lmdeploy.tokenizer import Tokenizer
+        from lmdeploy.turbomind import TurboMind
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
         self.logger = get_logger()
-        tokenizer_model_path = osp.join(path, 'triton_models', 'tokenizer')
-        self.tokenizer = Tokenizer(tokenizer_model_path)
-        tm_model = tm.TurboMind(path)
+        tm_model = TurboMind.from_pretrained(path)
+        self.tokenizer = tm_model.tokenizer
         self.generators = [
             tm_model.create_instance() for i in range(concurrency)
         ]
@@ -134,9 +131,10 @@ class TurboMindModel(BaseModel):
             sequence_start=True,
             sequence_end=True,
             top_k=1,
+            top_p=0.8,
             step=0,
             stream_output=False):
-        output_ids, _ = outputs[0]
-        response = self.tokenizer.decode(output_ids.tolist())
+        _, output_ids, _ = outputs
+        response = self.tokenizer.decode(output_ids)
         response = valid_str(response)
         return response
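
Putting the refactor together, here is a minimal end-to-end sketch of the new load-and-decode flow. `from_pretrained`, `tokenizer`, `create_instance`, the sampling kwargs, and the three-element output unpacking are all taken from the diff above; the `stream_infer` name and its `session_id`/`input_ids`/`request_output_len` arguments are assumptions about lmdeploy's generator API at the time and may not match exactly.

```python
from lmdeploy.turbomind import TurboMind

# Load a HuggingFace model ID (or local path) directly; no prior
# `lmdeploy convert` step is required.
tm_model = TurboMind.from_pretrained('internlm/internlm-chat-7b')
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()

prompt = 'Hello, what is TurboMind?'
input_ids = tokenizer.encode(prompt)

# Assumed generator API: stream_infer yields (status, output_ids, length)
# tuples, which matches the `_, output_ids, _ = outputs` unpacking above.
for outputs in generator.stream_infer(session_id=1,
                                      input_ids=[input_ids],
                                      request_output_len=100,
                                      sequence_start=True,
                                      sequence_end=True,
                                      top_k=1,
                                      top_p=0.8,
                                      step=0,
                                      stream_output=False):
    _, output_ids, _ = outputs

# The new code decodes the raw id list directly (no .tolist() needed).
response = tokenizer.decode(output_ids)
print(response)
```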