diff --git a/.gitignore b/.gitignore index 8e81c083..4dc68385 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,6 @@ docs/zh_cn/_build/ # sft config ignore list configs/sft_cfg/*B_* configs/cky/ + +# path of turbomind's model after runing `lmdeploy.serve.turbomind.deploy` +turbomind/ diff --git a/configs/eval_internlm_chat_7b_turbomind.py b/configs/eval_internlm_chat_7b_turbomind.py deleted file mode 100644 index dddc9898..00000000 --- a/configs/eval_internlm_chat_7b_turbomind.py +++ /dev/null @@ -1,32 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.turbomind import TurboMindModel - -with read_base(): - # choose a list of datasets - from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - # and output the results in a choosen format - from .summarizers.medium import summarizer - -datasets = [*gsm8k_datasets] - - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|User|>:', end='\n'), - dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), - ], - eos_token_id=103028) - -models = [ - dict( - type=TurboMindModel, - abbr='internlm-chat-7b-turbomind', - path="internlm-chat-7b", - tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=16, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - ) -] diff --git a/configs/eval_internlm_chat_turbomind.py b/configs/eval_internlm_chat_turbomind.py new file mode 100644 index 00000000..73b76447 --- /dev/null +++ b/configs/eval_internlm_chat_turbomind.py @@ -0,0 +1,116 @@ +from mmengine.config import read_base +from opencompass.models.turbomind import TurboMindModel + + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets + from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + from .datasets.race.race_gen_69ee4f import race_datasets + from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets + # and output the results in a choosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +meta_template = dict( + round=[ + dict(role='HUMAN', begin='<|User|>:', end='\n'), + dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), + ], + eos_token_id=103028) + +# config for internlm-chat-7b +# models = [ +# dict( +# type=TurboMindModel, +# abbr='internlm-chat-7b-turbomind', +# path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=32, +# concurrency=32, +# meta_template=meta_template, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] + +# config for internlm-chat-7b-w4 model +# models = [ +# dict( +# type=TurboMindModel, +# abbr='internlm-chat-7b-w4-turbomind', +# path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=32, +# concurrency=32, +# meta_template=meta_template, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] + +# config for internlm-chat-7b-w4kv8 model +# models = [ +# dict( +# type=TurboMindModel, +# abbr='internlm-chat-7b-w4kv8-turbomind', +# path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=32, +# concurrency=32, +# meta_template=meta_template, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] + +# config for internlm-chat-20b +# models = [ 
+# dict( +# type=TurboMindModel, +# abbr='internlm-chat-20b-turbomind', +# path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=8, +# concurrency=8, +# meta_template=meta_template, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] + +# config for internlm-chat-20b-w4 model +models = [ + dict( + type=TurboMindModel, + abbr='internlm-chat-20b-w4-turbomind', + path="./turbomind", + max_out_len=100, + max_seq_len=2048, + batch_size=16, + concurrency=16, + meta_template=meta_template, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +# config for internlm-chat-20b-w4kv8 model +# models = [ +# dict( +# type=TurboMindModel, +# abbr='internlm-chat-20b-w4kv8-turbomind', +# path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=16, +# concurrency=16, +# meta_template=meta_template, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] diff --git a/configs/eval_internlm_chat_turbomind_tis.py b/configs/eval_internlm_chat_turbomind_tis.py new file mode 100644 index 00000000..05a969b9 --- /dev/null +++ b/configs/eval_internlm_chat_turbomind_tis.py @@ -0,0 +1,40 @@ +from mmengine.config import read_base +from opencompass.models.turbomind_tis import TurboMindTisModel + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets + from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_6dc406 import WSC_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets + from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + from .datasets.race.race_gen_69ee4f import race_datasets + from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets + # and output the results in a choosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +meta_template = dict( + round=[ + dict(role='HUMAN', begin='<|User|>:', end='\n'), + dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), + ], + eos_token_id=103028) + +models = [ + dict( + type=TurboMindTisModel, + abbr='internlm-chat-20b-turbomind', + path="internlm", + tis_addr='0.0.0.0:33337', + max_out_len=100, + max_seq_len=2048, + batch_size=8, + meta_template=meta_template, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/eval_internlm_turbomind.py b/configs/eval_internlm_turbomind.py new file mode 100644 index 00000000..8d43321c --- /dev/null +++ b/configs/eval_internlm_turbomind.py @@ -0,0 +1,101 @@ +from mmengine.config import read_base +from opencompass.models.turbomind import TurboMindModel + + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets + from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # and output the results in a choosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +# # config for internlm-7b model +# models = [ +# dict( +# type=TurboMindModel, +# abbr='internlm-7b-turbomind', +# 
path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=32, +# concurrency=32, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] + +# # config for internlm-7b-w4 model +# models = [ +# dict( +# type=TurboMindModel, +# abbr='internlm-7b-w4-turbomind', +# path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=32, +# concurrency=32, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] + +# # config for internlm-7b-w4kv8 model +# models = [ +# dict( +# type=TurboMindModel, +# abbr='internlm-7b-w4kv8-turbomind', +# path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=32, +# concurrency=32, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] + +# config for internlm-20b model +models = [ + dict( + type=TurboMindModel, + abbr='internlm-20b-turbomind', + path="./turbomind", + max_out_len=100, + max_seq_len=2048, + batch_size=8, + concurrency=8, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +# config for internlm-20b-w4 model +# models = [ +# dict( +# type=TurboMindModel, +# abbr='internlm-20b-w4-turbomind', +# path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=16, +# concurrency=16, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] + + +# config for internlm-20b-w4kv8 model +# models = [ +# dict( +# type=TurboMindModel, +# abbr='internlm-20b-w4kv8-turbomind', +# path="./turbomind", +# max_out_len=100, +# max_seq_len=2048, +# batch_size=16, +# concurrency=16, +# run_cfg=dict(num_gpus=1, num_procs=1), +# ) +# ] diff --git a/configs/eval_internlm_turbomind_tis.py b/configs/eval_internlm_turbomind_tis.py new file mode 100644 index 00000000..9ac7299b --- /dev/null +++ b/configs/eval_internlm_turbomind_tis.py @@ -0,0 +1,28 @@ +from mmengine.config import read_base +from opencompass.models.turbomind_tis import TurboMindTisModel + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets + from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # and output the results in a choosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +models = [ + dict( + type=TurboMindTisModel, + abbr='internlm-chat-20b-turbomind', + path="internlm", + tis_addr='0.0.0.0:33337', + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md index 33774357..01b0c15f 100644 --- a/docs/en/advanced_guides/evaluation_turbomind.md +++ b/docs/en/advanced_guides/evaluation_turbomind.md @@ -18,42 +18,62 @@ pip install lmdeploy ## Evaluation -We take the InternLM as example. +OpenCompass integrates both turbomind's python API and gRPC API for evaluation. And the former is highly recommended. -### Step-1: Get InternLM model +We take the InternLM-20B as example. Please download it from huggingface and convert it to turbomind's model format: ```shell # 1. 
 # 1. Download InternLM model(or use the cached model's checkpoint)
 # Make sure you have git-lfs installed (https://git-lfs.com)
 git lfs install
-git clone https://huggingface.co/internlm/internlm-chat-7b /path/to/internlm-chat-7b
-
-# if you want to clone without large files – just their pointers
-# prepend your git clone with the following env var:
-GIT_LFS_SKIP_SMUDGE=1
-
-# 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b
+# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
+lmdeploy convert internlm /path/to/internlm-20b \
+    --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 
-### Step-2: Launch Triton Inference Server
+**Note**:
+
+If evaluating the InternLM Chat model, make sure to pass `internlm-chat` as the model name instead of `internlm` when converting the model format. The specific command is:
 
 ```shell
-bash ./workspace/service_docker_up.sh
+lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
+    --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 
-\*\*Note: \*\*In the implementation of turbomind, inference is "persistent". The "destroy" operation can lead to unexpected issues. Therefore, we temporarily use service interfaces for model evaluation. And we will integrate the Python API to OpenCompass when turbomind supports "destroy".
+### Evaluation with Turbomind Python API (recommended)
 
-### Step-3: Evaluate the Converted Model
-
-In the home folder of OpenCompass
+In the home folder of OpenCompass, start the evaluation with the following command:
 
 ```shell
-python run.py configs/eval_internlm_chat_7b_turbomind.py -w outputs/turbomind
+python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
 ```
 
 You are expected to get the evaluation results after the inference and evaluation.
 
-\*\*Note: \*\*In `eval_internlm_chat_7b_turbomind.py`, the configured Triton Inference Server (TIS) address is `tis_addr='0.0.0.0:33337'`. Please modify `tis_addr` to the IP address of the machine where the server is launched.
+**Note**:
+
+- If you evaluate the InternLM Chat model, please use the configuration file `eval_internlm_chat_turbomind.py`.
+- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by commenting out the configuration for the 20B model and enabling the configuration for the 7B model.
+
+### Evaluation with Turbomind gRPC API (optional)
+
+In the home folder of OpenCompass, launch the Triton Inference Server:
+
+```shell
+bash turbomind/service_docker_up.sh
+```
+
+Then start the evaluation with the following command:
+
+```shell
+python run.py configs/eval_internlm_turbomind_tis.py -w outputs/turbomind-tis/internlm-20b
+```
+
+**Note**:
+
+- If evaluating the InternLM Chat model, please use the config file `eval_internlm_chat_turbomind_tis.py`.
+- In `eval_internlm_turbomind_tis.py`, the configured Triton Inference Server (TIS) address is `tis_addr='0.0.0.0:33337'`. Please modify `tis_addr` to the IP address of the machine where the server is launched.
+- If evaluating the InternLM 7B model, please modify the config file by commenting out the configuration for the 20B model and enabling the configuration for the 7B model.
diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
index 969c7911..74da26ea 100644
--- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md
+++ b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
@@ -18,42 +18,59 @@ pip install lmdeploy
 
 ## 评测
 
-我们使用 InternLM 作为例子来介绍如何评测。
+OpenCompass 支持分别通过 turbomind python API 和 gRPC API 评测数据集。我们强烈推荐使用前者进行评测。
 
-### 第一步: 获取 InternLM 模型
+下文以 InternLM-20B 模型为例,介绍如何评测。首先,从 huggingface 上下载 InternLM 模型,并转换为 turbomind 模型格式:
 
 ```shell
 # 1. Download InternLM model(or use the cached model's checkpoint)
 # Make sure you have git-lfs installed (https://git-lfs.com)
 git lfs install
-git clone https://huggingface.co/internlm/internlm-chat-7b /path/to/internlm-chat-7b
-
-# if you want to clone without large files – just their pointers
-# prepend your git clone with the following env var:
-GIT_LFS_SKIP_SMUDGE=1
-
-# 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b
+# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
+lmdeploy convert internlm /path/to/internlm-20b \
+    --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 
-### 第二步: 启动 TurboMind 的 Triton Inference Server
+注意:如果评测 InternLM Chat 模型,那么在转换模型格式的时候,模型名字要填写 `internlm-chat`。具体命令是:
 
 ```shell
-bash ./workspace/service_docker_up.sh
+lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
+    --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 
-**注:** turbomind 的实现中,推理是“永驻”的。销毁操作会导致意想不到的问题发生。因此,我们暂时使用服务接口对接模型评测,待 turbomind 支持“销毁”之后,再提供 python API对接方式。
+### 通过 TurboMind Python API 评测(推荐)
 
-### 第三步: 评测转换后的模型
-
-在 OpenCompass 项目目录执行:
+在 OpenCompass 的项目目录下,执行如下命令可得到评测结果:
 
 ```shell
-python run.py configs/eval_internlm_chat_7b_turbomind.py -w outputs/turbomind
+python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
 ```
 
-当模型完成推理和指标计算后,我们便可获得模型的评测结果。
+**注:**
 
-**注:** `eval_internlm_chat_7b_turbomind.py` 中,配置的 triton inference server(TIS) 地址是 `tis_addr='0.0.0.0:63337'`。请把配置中的`tis_addr`修改为server所在机器的ip地址。
+- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py`
+- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。把 20B 模型的配置注释掉,打开 7B 模型的配置。
+
+### 通过 TurboMind gRPC API 评测(可选)
+
+在 OpenCompass 的项目目录下,启动 triton inference server:
+
+```shell
+bash turbomind/service_docker_up.sh
+```
+
+然后,执行如下命令进行评测:
+
+```shell
+python run.py configs/eval_internlm_turbomind_tis.py -w outputs/turbomind-tis/internlm-20b
+```
+
+**注:**
+
+- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind_tis.py`
+- 在配置文件中,triton inference server(TIS) 地址是 `tis_addr='0.0.0.0:33337'`。请把配置中的`tis_addr`修改为server所在机器的ip地址。
+- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_xxx_turbomind_tis.py`。把其中 20B 模型的配置注释掉,打开 7B 模型的配置。
diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py
index 52920dd1..937a37fd 100644
--- a/opencompass/models/__init__.py
+++ b/opencompass/models/__init__.py
@@ -17,5 +17,7 @@ from .minimax_api import MiniMax  # noqa: F401
 from .openai_api import OpenAI  # noqa: F401
 from .pangu_api import PanGu  # noqa: F401
 from .sensetime_api import SenseTime  # noqa: F401
+from
.turbomind import TurboMindModel # noqa: F401 +from .turbomind_tis import TurboMindTisModel # noqa: F401 from .xunfei_api import XunFei # noqa: F401 from .zhipuai_api import ZhiPuAI # noqa: F401 diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py index e81399ef..99f3d30c 100644 --- a/opencompass/models/turbomind.py +++ b/opencompass/models/turbomind.py @@ -1,11 +1,8 @@ -import logging -import threading +import os.path as osp from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union -from lmdeploy.serve.turbomind.chatbot import Chatbot - -from opencompass.models.base import BaseModel, LMTemplateParser +from opencompass.models.base import BaseModel from opencompass.utils.logging import get_logger from opencompass.utils.prompt import PromptList @@ -23,71 +20,86 @@ def valid_str(string, coding='utf-8'): class TurboMindModel(BaseModel): - """Model wrapper for TurboMind API. + """Model wrapper for TurboMind Python API. Args: - path (str): The name of OpenAI's model. - model_path (str): folder of the turbomind model's path + path (str): path of the turbomind model + concurrency (int): the maximum allowed concurrency of turbomind. max_seq_len (int): The maximum allowed sequence length of a model. Note that the length of prompt + generated tokens shall not exceed this value. Defaults to 2048. - query_per_second (int): The maximum queries allowed per second - between two consecutive calls of the API. Defaults to 1. - retry (int): Number of retires if the API call fails. Defaults to 2. meta_template (Dict, optional): The model's meta prompt template if needed, in case the requirement of injecting or wrapping of any meta instructions. """ - is_api: bool = True - def __init__( self, path: str, - tis_addr: str = '0.0.0.0:33337', + concurrency: int = 8, max_seq_len: int = 2048, meta_template: Optional[Dict] = None, ): + from lmdeploy import turbomind as tm + from lmdeploy.tokenizer import Tokenizer super().__init__(path=path, max_seq_len=max_seq_len, meta_template=meta_template) self.logger = get_logger() - self.template_parser = LMTemplateParser(meta_template) - self.eos_token_id = None - if meta_template and 'eos_token_id' in meta_template: - self.eos_token_id = meta_template['eos_token_id'] - self.tis_addr = tis_addr + tokenizer_model_path = osp.join(path, 'triton_models', 'tokenizer') + self.tokenizer = Tokenizer(tokenizer_model_path) + tm_model = tm.TurboMind(path) + self.generators = [ + tm_model.create_instance() for i in range(concurrency) + ] + self.generator_ids = [i + 1 for i in range(concurrency)] + self.generation_kwargs = dict() def generate( self, - inputs: List[str or PromptList], + inputs: List[str], max_out_len: int = 512, temperature: float = 1.0, ) -> List[str]: """Generate results given a list of inputs. Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. + inputs (List[str]): A list of prompts max_out_len (int): The maximum length of the output. temperature (float): What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more - focused and deterministic. Defaults to 0.7. + focused and deterministic. Defaults to 1.0. Returns: List[str]: A list of generated strings. 
""" + assert isinstance( + inputs, List), f'List(str) is expected, but got {type(inputs)}' - with ThreadPoolExecutor() as executor: - results = list( - executor.map(self._generate, inputs, - [max_out_len] * len(inputs), - [temperature] * len(inputs))) + # split inputs into batches + batch_size = len(self.generators) + batch_inputs = [ + inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size) + ] + + results = [] + for batch_input in batch_inputs: + with ThreadPoolExecutor() as executor: + _results = list( + executor.map(self._generate, + self.generators[:len(batch_input)], + self.generator_ids[:len(batch_input)], + batch_input, [max_out_len] * len(batch_input), + [temperature] * len(batch_input))) + results += _results return results + def get_token_len(self, prompt: str) -> int: + input_ids = self.tokenizer.encode(prompt) + return len(input_ids) + def wait(self): """Wait till the next query can be sent. @@ -95,8 +107,8 @@ class TurboMindModel(BaseModel): """ return self.token_bucket.get_token() - def _generate(self, prompt: str or PromptList, max_out_len: int, - temperature: float) -> str: + def _generate(self, generator, session_id, prompt: str or PromptList, + max_out_len: int, temperature: float) -> str: """Generate results given a list of inputs. Args: @@ -113,20 +125,20 @@ class TurboMindModel(BaseModel): str: The generated string. """ assert type( - prompt) is str, 'We only support string for TurboMind RPC API' - chatbot = Chatbot(self.tis_addr, - temperature=temperature, - capability='completion', - top_k=1, - log_level=logging.ERROR) + prompt) is str, 'We only support string for TurboMind Python API' - for status, text, n_token in chatbot.stream_infer( - session_id=threading.currentThread().ident, - prompt=prompt, - request_output_len=max_out_len, - sequence_start=True, - sequence_end=True): - continue - response = valid_str(text) - response = response.replace('', '') + prompt = '' + prompt + input_ids = self.tokenizer.encode(prompt) + + for outputs in generator.stream_infer(session_id=session_id, + input_ids=[input_ids], + request_output_len=max_out_len, + sequence_start=True, + sequence_end=True, + top_k=1, + step=0, + stream_output=False): + output_ids, _ = outputs[0] + response = self.tokenizer.decode(output_ids.tolist()) + response = valid_str(response) return response diff --git a/opencompass/models/turbomind_tis.py b/opencompass/models/turbomind_tis.py new file mode 100644 index 00000000..d1a41fbc --- /dev/null +++ b/opencompass/models/turbomind_tis.py @@ -0,0 +1,130 @@ +import logging +import threading +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Union + +from opencompass.models.base import BaseModel, LMTemplateParser +from opencompass.utils.logging import get_logger +from opencompass.utils.prompt import PromptList + +PromptType = Union[PromptList, str] + + +def valid_str(string, coding='utf-8'): + """decode text according to its encoding type.""" + invalid_chars = [b'\xef\xbf\xbd'] + bstr = bytes(string, coding) + for invalid_char in invalid_chars: + bstr = bstr.replace(invalid_char, b'') + ret = bstr.decode(encoding=coding, errors='ignore') + return ret + + +class TurboMindTisModel(BaseModel): + """Model wrapper for TurboMind Triton Inference Server gRPC API. + + Args: + path (str): The name of OpenAI's model. + tis_addr (str): The address (ip:port format) of turbomind's + triton inference server + max_seq_len (int): The maximum allowed sequence length of a model. 
+            Note that the length of prompt + generated tokens shall not exceed
+            this value. Defaults to 2048.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, in case the requirement of injecting or
+            wrapping of any meta instructions.
+    """
+
+    is_api: bool = True
+
+    def __init__(
+        self,
+        path: str,
+        tis_addr: str = '0.0.0.0:33337',
+        max_seq_len: int = 2048,
+        meta_template: Optional[Dict] = None,
+    ):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         meta_template=meta_template)
+        self.logger = get_logger()
+        self.template_parser = LMTemplateParser(meta_template)
+        self.eos_token_id = None
+        if meta_template and 'eos_token_id' in meta_template:
+            self.eos_token_id = meta_template['eos_token_id']
+        self.tis_addr = tis_addr
+        self.generation_kwargs = dict()
+
+    def generate(
+        self,
+        inputs: List[str or PromptList],
+        max_out_len: int = 512,
+        temperature: float = 1.0,
+    ) -> List[str]:
+        """Generate results given a list of inputs.
+
+        Args:
+            inputs (List[str or PromptList]): A list of strings or PromptDicts.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+            temperature (float): What sampling temperature to use,
+                between 0 and 2. Higher values like 0.8 will make the output
+                more random, while lower values like 0.2 will make it more
+                focused and deterministic. Defaults to 1.0.
+
+        Returns:
+            List[str]: A list of generated strings.
+        """
+
+        with ThreadPoolExecutor() as executor:
+            results = list(
+                executor.map(self._generate, inputs,
+                             [max_out_len] * len(inputs),
+                             [temperature] * len(inputs)))
+        return results
+
+    def wait(self):
+        """Wait till the next query can be sent.
+
+        Applicable in both single-thread and multi-thread environments.
+        """
+        return self.token_bucket.get_token()
+
+    def _generate(self, prompt: str or PromptList, max_out_len: int,
+                  temperature: float) -> str:
+        """Generate a result given a single prompt.
+
+        Args:
+            prompt (str or PromptList): A string or PromptDict.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+            temperature (float): What sampling temperature to use,
+                between 0 and 2. Higher values like 0.8 will make the output
+                more random, while lower values like 0.2 will make it more
+                focused and deterministic.
+
+        Returns:
+            str: The generated string.
+        """
+        assert type(
+            prompt) is str, 'We only support string for TurboMind RPC API'
+
+        from lmdeploy.serve.turbomind.chatbot import Chatbot
+        chatbot = Chatbot(self.tis_addr,
+                          temperature=temperature,
+                          capability='completion',
+                          top_k=1,
+                          log_level=logging.ERROR)
+
+        for status, text, n_token in chatbot.stream_infer(
+                session_id=threading.currentThread().ident,
+                prompt=prompt,
+                request_output_len=max_out_len,
+                sequence_start=True,
+                sequence_end=True):
+            continue
+        response = valid_str(text)
+        response = response.replace('', '')
+        return response
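Below is a minimal usage sketch of the new `TurboMindModel` wrapper outside of OpenCompass's config/runner machinery. It is not part of the patch: it only relies on the constructor arguments and the `generate` signature introduced above, and it assumes that lmdeploy is installed and that a converted model already exists in `./turbomind` (the `--dst-path` used in the docs); the prompt is purely illustrative.

```python
# Illustrative sketch only (not part of the patch). Assumes a converted model
# exists in ./turbomind, e.g. produced by:
#   lmdeploy convert internlm /path/to/internlm-20b --dst-path ./turbomind
from opencompass.models import TurboMindModel

model = TurboMindModel(
    path='./turbomind',  # folder containing the converted model (triton_models/...)
    concurrency=8,       # number of turbomind generator instances to create
    max_seq_len=2048,
)

prompts = ['The capital of France is']  # hypothetical prompt
print(model.generate(prompts, max_out_len=32)[0])
```

In a real run, the same wrapper is instantiated by the config files added in this patch (`configs/eval_internlm_turbomind.py` and `configs/eval_internlm_chat_turbomind.py`) rather than by hand.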