Integrate turbomind inference via its RPC API instead of its python API (#414)

* support tis

* integrate turbomind inference via its RPC API instead of its python API

* update guide

* update ip address spec

* update according to reviewer's comments
Lyu Han 2023-10-07 10:27:48 +08:00 committed by GitHub
parent 9db5652638
commit 6738247142
4 changed files with 51 additions and 72 deletions


@@ -3,30 +3,30 @@ from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from .datasets.SuperGLUE_CB.SuperGLUE_CB_gen import CB_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = [*CB_datasets]
datasets = [*gsm8k_datasets]
_meta_template = dict(
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
)
eos_token_id=103028)
models = [
dict(
type=TurboMindModel,
abbr='internlm-chat-7b-tb',
abbr='internlm-chat-7b-turbomind',
path="internlm-chat-7b",
model_path='./workspace',
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
meta_template=_meta_template,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
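For orientation, the `meta_template` above tells OpenCompass how each dialogue round is framed before the text is sent to the server. Below is a rough sketch of the resulting prompt layout, assuming plain begin/end concatenation (illustration only; the actual rendering is handled by OpenCompass' template parser):

```python
# Hypothetical illustration of how the round markers above frame a prompt.
round_cfg = [
    dict(role='HUMAN', begin='<|User|>:', end='\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
]

def render_single_turn(question: str) -> str:
    human, bot = round_cfg
    # The generating role ('BOT') contributes only its begin marker;
    # the model is expected to continue from there.
    return f"{human['begin']}{question}{human['end']}{bot['begin']}"

print(render_single_turn('What is 27 + 15?'))
# <|User|>:What is 27 + 15?
# <|Bot|>:
```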


@@ -38,12 +38,14 @@ python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-ch
```
### Step-2: Verify the Converted Model
### Step-2: Launch Triton Inference Server
```shell
python -m lmdeploy.turbomind.chat ./workspace
bash ./workspace/service_docker_up.sh
```
**Note:** In turbomind's implementation, inference is "persistent": the "destroy" operation can lead to unexpected issues. Therefore, we temporarily use the service interface for model evaluation, and will integrate the Python API into OpenCompass once turbomind supports "destroy".
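Before running the evaluation, you can sanity-check the launched server with `lmdeploy`'s `Chatbot` RPC client, which is the same interface the OpenCompass `TurboMindModel` uses internally. A minimal sketch, assuming the server runs locally on the default port (the prompt and session id are placeholders):

```python
import logging

from lmdeploy.serve.turbomind.chatbot import Chatbot

# Replace with the address of the machine running service_docker_up.sh.
chatbot = Chatbot('0.0.0.0:33337',
                  capability='completion',
                  top_k=1,
                  log_level=logging.ERROR)

# stream_infer yields (status, text, n_token); we only keep the final chunk.
for status, text, n_token in chatbot.stream_infer(session_id=1,
                                                   prompt='Hello, who are you?',
                                                   request_output_len=64,
                                                   sequence_start=True,
                                                   sequence_end=True):
    pass

print(text)
```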
### Step-3: Evaluate the Converted Model
In the home folder of OpenCompass
@@ -53,3 +55,5 @@ python run.py configs/eval_internlm_chat_7b_turbomind.py -w outputs/turbomind
```
You are expected to get the evaluation results after the inference and evaluation.
**Note:** In `eval_internlm_chat_7b_turbomind.py`, the Triton Inference Server (TIS) address is configured as `tis_addr='0.0.0.0:33337'`. Please change `tis_addr` to the IP address of the machine on which the server is launched.
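For example, if the server was launched on a host reachable at `10.1.52.34` (a placeholder IP), the model entry in the config would become:

```python
models = [
    dict(
        type=TurboMindModel,
        abbr='internlm-chat-7b-turbomind',
        path='internlm-chat-7b',
        tis_addr='10.1.52.34:33337',  # <server-ip>:<port> of the launched TIS
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        meta_template=meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```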


@@ -38,12 +38,14 @@ python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-ch
```
### Step-2: Verify the Converted Model
### Step-2: Launch TurboMind's Triton Inference Server
```shell
python -m lmdeploy.turbomind.chat ./workspace
bash ./workspace/service_docker_up.sh
```
**Note:** In turbomind's implementation, inference is "persistent": the "destroy" operation can lead to unexpected issues. Therefore, we temporarily use the service interface for model evaluation, and will provide the Python API integration once turbomind supports "destroy".
### Step-3: Evaluate the Converted Model
In the OpenCompass project directory, run:
@@ -53,3 +55,5 @@ python run.py configs/eval_internlm_chat_7b_turbomind.py -w outputs/turbomind
```
Once the model completes inference and metric computation, you will get its evaluation results.
**Note:** In `eval_internlm_chat_7b_turbomind.py`, the Triton Inference Server (TIS) address is configured as `tis_addr='0.0.0.0:33337'`. Please change `tis_addr` to the IP address of the machine on which the server is launched.


@@ -1,10 +1,11 @@
import os.path as osp
import random
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.models.base import BaseModel
from opencompass.models.base_api import TokenBucket
from lmdeploy.serve.turbomind.chatbot import Chatbot
from opencompass.models.base import BaseModel, LMTemplateParser
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
@@ -43,10 +44,8 @@ class TurboMindModel(BaseModel):
def __init__(
self,
path: str,
model_path: str,
tis_addr: str = '0.0.0.0:33337',
max_seq_len: int = 2048,
query_per_second: int = 1,
retry: int = 2,
meta_template: Optional[Dict] = None,
):
@@ -54,27 +53,17 @@ class TurboMindModel(BaseModel):
max_seq_len=max_seq_len,
meta_template=meta_template)
self.logger = get_logger()
from lmdeploy import turbomind as tm
from lmdeploy.model import MODELS as LMMODELS
from lmdeploy.turbomind.tokenizer import Tokenizer as LMTokenizer
self.retry = retry
tokenizer_model_path = osp.join(model_path, 'triton_models',
'tokenizer')
self.tokenizer = LMTokenizer(tokenizer_model_path)
tm_model = tm.TurboMind(model_path, eos_id=self.tokenizer.eos_token_id)
self.model_name = tm_model.model_name
self.model = LMMODELS.get(self.model_name)()
self.generator = tm_model.create_instance()
self.token_bucket = TokenBucket(query_per_second)
self.template_parser = LMTemplateParser(meta_template)
self.eos_token_id = None
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
self.tis_addr = tis_addr
def generate(
self,
inputs: List[str or PromptList],
max_out_len: int = 512,
temperature: float = 0.0,
temperature: float = 1.0,
) -> List[str]:
"""Generate results given a list of inputs.
@@ -91,10 +80,10 @@ class TurboMindModel(BaseModel):
Returns:
List[str]: A list of generated strings.
"""
prompts = inputs
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, prompts,
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)))
return results
@@ -106,12 +95,12 @@ class TurboMindModel(BaseModel):
"""
return self.token_bucket.get_token()
def _generate(self, input: str or PromptList, max_out_len: int,
def _generate(self, prompt: str or PromptList, max_out_len: int,
temperature: float) -> str:
"""Generate results given a list of inputs.
Args:
inputs (str or PromptList): A string or PromptDict.
prompt (str or PromptList): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
@@ -123,39 +112,21 @@ class TurboMindModel(BaseModel):
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
assert type(
input
) is str, 'We only support string for TurboMind Python API now'
intput_token_ids = self.tokenizer.encode(input)
for _ in range(self.retry):
self.wait()
session_id = random.randint(1, 100000)
nth_round = 0
for outputs in self.generator.stream_infer(
session_id=session_id,
input_ids=[intput_token_ids],
stream_output=False,
request_output_len=max_out_len,
sequence_start=(nth_round == 0),
sequence_end=False,
step=0,
stop=False,
top_k=40,
top_p=0.8,
temperature=temperature,
repetition_penalty=1.0,
ignore_eos=False,
random_seed=random.getrandbits(64)
if nth_round == 0 else None):
pass
output_token_ids, _ = outputs[0]
# decode output_token_ids
response = self.tokenizer.decode(output_token_ids)
response = valid_str(response)
prompt) is str, 'We only support string for TurboMind RPC API'
chatbot = Chatbot(self.tis_addr,
temperature=temperature,
capability='completion',
top_k=1,
log_level=logging.ERROR)
for status, text, n_token in chatbot.stream_infer(
session_id=threading.currentThread().ident,
prompt=prompt,
request_output_len=max_out_len,
sequence_start=True,
sequence_end=True):
continue
response = valid_str(text)
response = response.replace('<eoa>', '')
return response
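For a quick standalone check of the class above (outside a full `run.py` evaluation), something along these lines should work, assuming the Triton Inference Server from the guide is reachable at the default address; the question is a placeholder:

```python
from opencompass.models.turbomind import TurboMindModel

# Assumes the TIS launched via service_docker_up.sh is listening here.
model = TurboMindModel(path='internlm-chat-7b',
                       tis_addr='0.0.0.0:33337',
                       max_seq_len=2048)

outputs = model.generate(
    ['A farmer has 17 sheep and buys 5 more. How many does he have now?'],
    max_out_len=100)
print(outputs[0])
```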