Integrate turbomind python api (#484)

* integrate turbomind python api

* update

* update user guide

* update

* fix according to reviewer's comments

* fix error

* fix linting

* update user guide

* remove debug log

---------

Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
Lyu Han 2023-11-21 22:34:46 +08:00 committed by GitHub
parent d925748266
commit eb56fd6d16
11 changed files with 552 additions and 115 deletions

.gitignore vendored

@@ -90,3 +90,6 @@ docs/zh_cn/_build/
# sft config ignore list
configs/sft_cfg/*B_*
configs/cky/

# path of turbomind's model after running `lmdeploy.serve.turbomind.deploy`
turbomind/


@@ -1,32 +0,0 @@
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = [*gsm8k_datasets]
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
models = [
dict(
type=TurboMindModel,
abbr='internlm-chat-7b-turbomind',
path="internlm-chat-7b",
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]


@@ -0,0 +1,116 @@
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.race.race_gen_69ee4f import race_datasets
from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
# config for internlm-chat-7b
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-7b-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-chat-7b-w4 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-7b-w4-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-chat-7b-w4kv8 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-7b-w4kv8-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-chat-20b
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-20b-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=8,
# concurrency=8,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-chat-20b-w4 model
models = [
dict(
type=TurboMindModel,
abbr='internlm-chat-20b-w4-turbomind',
path="./turbomind",
max_out_len=100,
max_seq_len=2048,
batch_size=16,
concurrency=16,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
# config for internlm-chat-20b-w4kv8 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-20b-w4kv8-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=16,
# concurrency=16,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]


@@ -0,0 +1,40 @@
from mmengine.config import read_base
from opencompass.models.turbomind_tis import TurboMindTisModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_6dc406 import WSC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.race.race_gen_69ee4f import race_datasets
from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
models = [
dict(
type=TurboMindTisModel,
abbr='internlm-chat-20b-turbomind',
path="internlm",
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]


@@ -0,0 +1,101 @@
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# # config for internlm-7b model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-7b-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# # config for internlm-7b-w4 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-7b-w4-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# # config for internlm-7b-w4kv8 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-7b-w4kv8-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-20b model
models = [
dict(
type=TurboMindModel,
abbr='internlm-20b-turbomind',
path="./turbomind",
max_out_len=100,
max_seq_len=2048,
batch_size=8,
concurrency=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
# config for internlm-20b-w4 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-20b-w4-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=16,
# concurrency=16,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-20b-w4kv8 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-20b-w4kv8-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=16,
# concurrency=16,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]


@@ -0,0 +1,28 @@
from mmengine.config import read_base
from opencompass.models.turbomind_tis import TurboMindTisModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = [
dict(
type=TurboMindTisModel,
abbr='internlm-chat-20b-turbomind',
path="internlm",
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]


@@ -18,42 +18,62 @@ pip install lmdeploy

## Evaluation

OpenCompass integrates both turbomind's python API and gRPC API for evaluation, and the former is highly recommended.

We take InternLM-20B as an example. Please download it from huggingface and convert it to turbomind's model format:

```shell
# 1. Download InternLM model (or use the cached model's checkpoint)
# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b

# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
lmdeploy convert internlm /path/to/internlm-20b \
    --dst-path {/home/folder/of/opencompass}/turbomind
```

**Note**:
If evaluating the InternLM Chat model, make sure to pass `internlm-chat` as the model name instead of `internlm` when converting the model format. The specific command is:

```shell
lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
    --dst-path {/home/folder/of/opencompass}/turbomind
```

### Evaluation with Turbomind Python API (recommended)

In the home folder of OpenCompass, start evaluation by the following command:

```shell
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
```

You are expected to get the evaluation results after the inference and evaluation.

**Note**:

- If you evaluate the InternLM Chat model, please use the configuration file `eval_internlm_chat_turbomind.py`.
- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by commenting out the configuration for the 20B model and enabling the configuration for the 7B model.

### Evaluation with Turbomind gRPC API (optional)

In the home folder of OpenCompass, launch the Triton Inference Server:

```shell
bash turbomind/service_docker_up.sh
```

And start evaluation by the following command:

```shell
python run.py configs/eval_internlm_turbomind_tis.py -w outputs/turbomind-tis/internlm-20b
```

**Note**:

- If the InternLM Chat model is to be evaluated, please use the config file `eval_internlm_chat_turbomind_tis.py`.
- In `eval_internlm_turbomind_tis.py`, the configured Triton Inference Server (TIS) address is `tis_addr='0.0.0.0:33337'`. Please modify `tis_addr` to the IP address of the machine where the server is launched.
- If evaluating the InternLM 7B model, please modify the config file, commenting out the configuration for the 20B model and enabling the configuration for the 7B model.
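For orientation, the sketch below shows the turbomind Python API that the new `TurboMindModel` wrapper in this commit builds on. It is a minimal sketch, assuming the converted model sits in `./turbomind` as produced by the `lmdeploy convert` step above; the prompt and `session_id` are illustrative, and the calls mirror those in the wrapper's `_generate` method.

```python
# Minimal sketch of the turbomind Python API used by the new TurboMindModel
# wrapper. Assumptions: the converted model is in ./turbomind (see the
# `lmdeploy convert` step above); the prompt and session_id are illustrative.
import os.path as osp

from lmdeploy import turbomind as tm
from lmdeploy.tokenizer import Tokenizer

model_path = './turbomind'
tokenizer = Tokenizer(osp.join(model_path, 'triton_models', 'tokenizer'))
generator = tm.TurboMind(model_path).create_instance()

# encode the prompt, run one generation request, and decode the output ids
input_ids = tokenizer.encode('<BOS>' + 'The capital of France is')
for outputs in generator.stream_infer(session_id=1,
                                      input_ids=[input_ids],
                                      request_output_len=32,
                                      sequence_start=True,
                                      sequence_end=True,
                                      top_k=1,
                                      step=0,
                                      stream_output=False):
    output_ids, _ = outputs[0]
    print(tokenizer.decode(output_ids.tolist()))
```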


@@ -18,42 +18,59 @@ pip install lmdeploy

## Evaluation

OpenCompass supports evaluating datasets through either turbomind's python API or its gRPC API. We strongly recommend the former.

The following takes the InternLM-20B model as an example. First, download the InternLM model from huggingface and convert it to turbomind's model format:

```shell
# 1. Download InternLM model (or use the cached model's checkpoint)
# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b

# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
lmdeploy convert internlm /path/to/internlm-20b \
    --dst-path {/home/folder/of/opencompass}/turbomind
```

Note: if evaluating the InternLM Chat model, pass `internlm-chat` as the model name when converting the model format. The specific command is:

```shell
lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
    --dst-path {/home/folder/of/opencompass}/turbomind
```

### Evaluation with the TurboMind Python API (recommended)

In the project directory of OpenCompass, run the following command to get the evaluation results:

```shell
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
```

**Note:**

- If evaluating the InternLM Chat model, please use the configuration file `eval_internlm_chat_turbomind.py`.
- If evaluating the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py`: comment out the configuration for the 20B model and enable the configuration for the 7B model.

### Evaluation with the TurboMind gRPC API (optional)

In the project directory of OpenCompass, launch the Triton Inference Server:

```shell
bash turbomind/service_docker_up.sh
```

Then run the following command to evaluate:

```shell
python run.py configs/eval_internlm_turbomind_tis.py -w outputs/turbomind-tis/internlm-20b
```

**Note:**

- If evaluating the InternLM Chat model, please use the configuration file `eval_internlm_chat_turbomind_tis.py`.
- In the configuration file, the Triton Inference Server (TIS) address is `tis_addr='0.0.0.0:33337'`. Please modify `tis_addr` to the IP address of the machine where the server is launched.
- If evaluating the InternLM 7B model, please modify `eval_internlm_xxx_turbomind_tis.py`: comment out the configuration for the 20B model and enable the configuration for the 7B model.
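To make the 7B note above concrete, the following is roughly what the `models` list in `eval_internlm_turbomind.py` looks like after commenting out the 20B block and enabling the 7B block; it simply restates the commented-out variant from that config.

```python
# internlm-7b variant of the turbomind config, mirroring the commented-out
# block in eval_internlm_turbomind.py
from opencompass.models.turbomind import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='internlm-7b-turbomind',
        path='./turbomind',
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        concurrency=32,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```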


@@ -17,5 +17,7 @@ from .minimax_api import MiniMax  # noqa: F401
from .openai_api import OpenAI  # noqa: F401
from .pangu_api import PanGu  # noqa: F401
from .sensetime_api import SenseTime  # noqa: F401
from .turbomind import TurboMindModel  # noqa: F401
from .turbomind_tis import TurboMindTisModel  # noqa: F401
from .xunfei_api import XunFei  # noqa: F401
from .zhipuai_api import ZhiPuAI  # noqa: F401


@@ -1,11 +1,8 @@
import os.path as osp
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union

from opencompass.models.base import BaseModel
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList

@@ -23,71 +20,86 @@ def valid_str(string, coding='utf-8'):

class TurboMindModel(BaseModel):
    """Model wrapper for TurboMind Python API.

    Args:
        path (str): path of the turbomind model
        concurrency (int): the maximum allowed concurrency of turbomind.
        max_seq_len (int): The maximum allowed sequence length of a model.
            Note that the length of prompt + generated tokens shall not exceed
            this value. Defaults to 2048.
        meta_template (Dict, optional): The model's meta prompt
            template if needed, in case the requirement of injecting or
            wrapping of any meta instructions.
    """

    def __init__(
        self,
        path: str,
        concurrency: int = 8,
        max_seq_len: int = 2048,
        meta_template: Optional[Dict] = None,
    ):
        from lmdeploy import turbomind as tm
        from lmdeploy.tokenizer import Tokenizer

        super().__init__(path=path,
                         max_seq_len=max_seq_len,
                         meta_template=meta_template)
        self.logger = get_logger()
        tokenizer_model_path = osp.join(path, 'triton_models', 'tokenizer')
        self.tokenizer = Tokenizer(tokenizer_model_path)
        tm_model = tm.TurboMind(path)
        self.generators = [
            tm_model.create_instance() for i in range(concurrency)
        ]
        self.generator_ids = [i + 1 for i in range(concurrency)]
        self.generation_kwargs = dict()

    def generate(
        self,
        inputs: List[str],
        max_out_len: int = 512,
        temperature: float = 1.0,
    ) -> List[str]:
        """Generate results given a list of inputs.

        Args:
            inputs (List[str]): A list of prompts.
            max_out_len (int): The maximum length of the output.
            temperature (float): What sampling temperature to use,
                between 0 and 2. Higher values like 0.8 will make the output
                more random, while lower values like 0.2 will make it more
                focused and deterministic. Defaults to 1.0.

        Returns:
            List[str]: A list of generated strings.
        """
        assert isinstance(
            inputs, List), f'List(str) is expected, but got {type(inputs)}'

        # split inputs into batches
        batch_size = len(self.generators)
        batch_inputs = [
            inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)
        ]

        results = []
        for batch_input in batch_inputs:
            with ThreadPoolExecutor() as executor:
                _results = list(
                    executor.map(self._generate,
                                 self.generators[:len(batch_input)],
                                 self.generator_ids[:len(batch_input)],
                                 batch_input, [max_out_len] * len(batch_input),
                                 [temperature] * len(batch_input)))
                results += _results
        return results

    def get_token_len(self, prompt: str) -> int:
        input_ids = self.tokenizer.encode(prompt)
        return len(input_ids)

    def wait(self):
        """Wait till the next query can be sent.

@@ -95,8 +107,8 @@ class TurboMindModel(BaseModel):
        """
        return self.token_bucket.get_token()

    def _generate(self, generator, session_id, prompt: str or PromptList,
                  max_out_len: int, temperature: float) -> str:
        """Generate results given a list of inputs.

        Args:
@@ -113,20 +125,20 @@ class TurboMindModel(BaseModel):
            str: The generated string.
        """
        assert type(
            prompt) is str, 'We only support string for TurboMind Python API'

        prompt = '<BOS>' + prompt
        input_ids = self.tokenizer.encode(prompt)

        for outputs in generator.stream_infer(session_id=session_id,
                                              input_ids=[input_ids],
                                              request_output_len=max_out_len,
                                              sequence_start=True,
                                              sequence_end=True,
                                              top_k=1,
                                              step=0,
                                              stream_output=False):
            output_ids, _ = outputs[0]
            response = self.tokenizer.decode(output_ids.tolist())
            response = valid_str(response)
        return response
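A short, hypothetical usage sketch of the wrapper above, driven outside the OpenCompass runner; the model path and prompt are placeholders rather than part of this commit, and in normal use the class is instantiated from a config such as `eval_internlm_turbomind.py`.

```python
# Hypothetical standalone use of the TurboMindModel wrapper defined above;
# the path and prompt are placeholders.
from opencompass.models import TurboMindModel

model = TurboMindModel(
    path='./turbomind',  # folder produced by `lmdeploy convert`
    concurrency=8,       # number of turbomind generator instances
    max_seq_len=2048,
)
print(model.generate(['The capital of France is'], max_out_len=32))
```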


@@ -0,0 +1,130 @@
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.models.base import BaseModel, LMTemplateParser
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
bstr = bstr.replace(invalid_char, b'')
ret = bstr.decode(encoding=coding, errors='ignore')
return ret
class TurboMindTisModel(BaseModel):
"""Model wrapper for TurboMind Triton Inference Server gRPC API.
Args:
path (str): The name of the model served by turbomind's triton inference server.
tis_addr (str): The address (ip:port format) of turbomind's
triton inference server
max_seq_len (int): The maximum allowed sequence length of a model.
Note that the length of prompt + generated tokens shall not exceed
this value. Defaults to 2048.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
"""
is_api: bool = True
def __init__(
self,
path: str,
tis_addr: str = '0.0.0.0:33337',
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
):
super().__init__(path=path,
max_seq_len=max_seq_len,
meta_template=meta_template)
self.logger = get_logger()
self.template_parser = LMTemplateParser(meta_template)
self.eos_token_id = None
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
self.tis_addr = tis_addr
self.generation_kwargs = dict()
def generate(
self,
inputs: List[str or PromptList],
max_out_len: int = 512,
temperature: float = 1.0,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str or PromptList]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic. Defaults to 1.0.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)))
return results
def wait(self):
"""Wait till the next query can be sent.
Applicable in both single-thread and multi-thread environments.
"""
return self.token_bucket.get_token()
def _generate(self, prompt: str or PromptList, max_out_len: int,
temperature: float) -> str:
"""Generate results given a list of inputs.
Args:
prompt (str or PromptList): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic.
Returns:
str: The generated string.
"""
assert type(
prompt) is str, 'We only support string for TurboMind RPC API'
from lmdeploy.serve.turbomind.chatbot import Chatbot
chatbot = Chatbot(self.tis_addr,
temperature=temperature,
capability='completion',
top_k=1,
log_level=logging.ERROR)
for status, text, n_token in chatbot.stream_infer(
session_id=threading.currentThread().ident,
prompt=prompt,
request_output_len=max_out_len,
sequence_start=True,
sequence_end=True):
continue
response = valid_str(text)
response = response.replace('<eoa>', '')
return response
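For comparison with the Python API path, the gRPC client wrapped by `TurboMindTisModel._generate` can be exercised directly, as sketched below. The server address and prompt are assumptions; a turbomind Triton Inference Server is expected to be running, e.g. via `bash turbomind/service_docker_up.sh`.

```python
# Minimal sketch of the Triton Inference Server gRPC path wrapped by
# TurboMindTisModel. Assumptions: a turbomind TIS instance is listening on
# 0.0.0.0:33337 and the prompt is illustrative.
import logging

from lmdeploy.serve.turbomind.chatbot import Chatbot

chatbot = Chatbot('0.0.0.0:33337',
                  temperature=1.0,
                  capability='completion',
                  top_k=1,
                  log_level=logging.ERROR)

# stream_infer yields (status, text, n_token); the final `text` holds the
# full completion, as in TurboMindTisModel._generate
for status, text, n_token in chatbot.stream_infer(session_id=1,
                                                  prompt='The capital of France is',
                                                  request_output_len=32,
                                                  sequence_start=True,
                                                  sequence_end=True):
    pass
print(text.replace('<eoa>', ''))
```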