Integrate turbomind python api (#484)

* integrate turbomind python api

* update

* update user guide

* update

* fix according to reviewer's comments

* fix error

* fix linting

* update user guide

* remove debug log

---------

Co-authored-by: Songyang Zhang <tonysy@users.noreply.github.com>
Lyu Han 2023-11-21 22:34:46 +08:00 committed by GitHub
parent d925748266
commit eb56fd6d16
11 changed files with 552 additions and 115 deletions

.gitignore vendored

@@ -90,3 +90,6 @@ docs/zh_cn/_build/
# sft config ignore list
configs/sft_cfg/*B_*
configs/cky/

# path of turbomind's model after running `lmdeploy.serve.turbomind.deploy`
turbomind/


@@ -1,32 +0,0 @@
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = [*gsm8k_datasets]
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
models = [
dict(
type=TurboMindModel,
abbr='internlm-chat-7b-turbomind',
path="internlm-chat-7b",
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]


@@ -0,0 +1,116 @@
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.race.race_gen_69ee4f import race_datasets
from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
# config for internlm-chat-7b
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-7b-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-chat-7b-w4 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-7b-w4-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-chat-7b-w4kv8 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-7b-w4kv8-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-chat-20b
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-20b-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=8,
# concurrency=8,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-chat-20b-w4 model
models = [
dict(
type=TurboMindModel,
abbr='internlm-chat-20b-w4-turbomind',
path="./turbomind",
max_out_len=100,
max_seq_len=2048,
batch_size=16,
concurrency=16,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
# config for internlm-chat-20b-w4kv8 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-chat-20b-w4kv8-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=16,
# concurrency=16,
# meta_template=meta_template,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]


@@ -0,0 +1,40 @@
from mmengine.config import read_base
from opencompass.models.turbomind_tis import TurboMindTisModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_6dc406 import WSC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.race.race_gen_69ee4f import race_datasets
from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
models = [
dict(
type=TurboMindTisModel,
abbr='internlm-chat-20b-turbomind',
path="internlm",
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]


@@ -0,0 +1,101 @@
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# # config for internlm-7b model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-7b-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# # config for internlm-7b-w4 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-7b-w4-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# # config for internlm-7b-w4kv8 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-7b-w4kv8-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=32,
# concurrency=32,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-20b model
models = [
dict(
type=TurboMindModel,
abbr='internlm-20b-turbomind',
path="./turbomind",
max_out_len=100,
max_seq_len=2048,
batch_size=8,
concurrency=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
# config for internlm-20b-w4 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-20b-w4-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=16,
# concurrency=16,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-20b-w4kv8 model
# models = [
# dict(
# type=TurboMindModel,
# abbr='internlm-20b-w4kv8-turbomind',
# path="./turbomind",
# max_out_len=100,
# max_seq_len=2048,
# batch_size=16,
# concurrency=16,
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]


@@ -0,0 +1,28 @@
from mmengine.config import read_base
from opencompass.models.turbomind_tis import TurboMindTisModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = [
dict(
type=TurboMindTisModel,
abbr='internlm-chat-20b-turbomind',
path="internlm",
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]


@@ -18,42 +18,62 @@ pip install lmdeploy

## Evaluation

OpenCompass integrates both turbomind's python API and gRPC API for evaluation, and the former is highly recommended.

We take InternLM-20B as an example. Please download it from huggingface and convert it to turbomind's model format:

```shell
# 1. Download InternLM model (or use the cached model's checkpoint)
# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b

# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
lmdeploy convert internlm /path/to/internlm-20b \
    --dst-path {/home/folder/of/opencompass}/turbomind
```

**Note**:
If evaluating the InternLM Chat model, make sure to pass `internlm-chat` as the model name instead of `internlm` when converting the model format. The specific command is:

```shell
lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
    --dst-path {/home/folder/of/opencompass}/turbomind
```

### Evaluation with Turbomind Python API (recommended)

In the home folder of OpenCompass, start evaluation by the following command:

```shell
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
```

You are expected to get the evaluation results after the inference and evaluation.

**Note**:

- If you evaluate the InternLM Chat model, please use the configuration file `eval_internlm_chat_turbomind.py`.
- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by commenting out the configuration for the 20B model and enabling the configuration for the 7B model.

### Evaluation with Turbomind gRPC API (optional)

In the home folder of OpenCompass, launch the Triton Inference Server:

```shell
bash turbomind/service_docker_up.sh
```

And start evaluation by the following command:

```shell
python run.py configs/eval_internlm_turbomind_tis.py -w outputs/turbomind-tis/internlm-20b
```

**Note**:

- If the InternLM Chat model is to be evaluated, please use the config file `eval_internlm_chat_turbomind_tis.py`.
- In `eval_internlm_turbomind_tis.py`, the configured Triton Inference Server (TIS) address is `tis_addr='0.0.0.0:33337'`. Please modify `tis_addr` to the IP address of the machine where the server is launched.
- If evaluating the InternLM 7B model, please modify the config file, commenting out the configuration for the 20B model and enabling the configuration for the 7B model.
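For orientation, the sketch below shows the turbomind Python API that the new `TurboMindModel` wrapper in this commit builds on. It is a minimal sketch, assuming the converted model sits in `./turbomind` as produced by the `lmdeploy convert` step above; the prompt and `session_id` are illustrative, and the calls mirror those in the wrapper's `_generate` method.

```python
# Minimal sketch of the turbomind Python API used by the new TurboMindModel
# wrapper. Assumptions: the converted model is in ./turbomind (see the
# `lmdeploy convert` step above); the prompt and session_id are illustrative.
import os.path as osp

from lmdeploy import turbomind as tm
from lmdeploy.tokenizer import Tokenizer

model_path = './turbomind'
tokenizer = Tokenizer(osp.join(model_path, 'triton_models', 'tokenizer'))
generator = tm.TurboMind(model_path).create_instance()

# encode the prompt, run one generation request, and decode the output ids
input_ids = tokenizer.encode('<BOS>' + 'The capital of France is')
for outputs in generator.stream_infer(session_id=1,
                                      input_ids=[input_ids],
                                      request_output_len=32,
                                      sequence_start=True,
                                      sequence_end=True,
                                      top_k=1,
                                      step=0,
                                      stream_output=False):
    output_ids, _ = outputs[0]
    print(tokenizer.decode(output_ids.tolist()))
```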


@@ -18,42 +18,59 @@ pip install lmdeploy

## Evaluation

OpenCompass supports evaluating datasets through either turbomind's python API or its gRPC API. We strongly recommend the former.

The following takes the InternLM-20B model as an example. First, download the InternLM model from huggingface and convert it to turbomind's model format:

```shell
# 1. Download InternLM model (or use the cached model's checkpoint)
# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b

# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
lmdeploy convert internlm /path/to/internlm-20b \
    --dst-path {/home/folder/of/opencompass}/turbomind
```

Note: if evaluating the InternLM Chat model, pass `internlm-chat` as the model name when converting the model format. The specific command is:

```shell
lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
    --dst-path {/home/folder/of/opencompass}/turbomind
```

### Evaluation with the TurboMind Python API (recommended)

In the project directory of OpenCompass, run the following command to get the evaluation results:

```shell
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
```

**Note:**

- If evaluating the InternLM Chat model, please use the configuration file `eval_internlm_chat_turbomind.py`.
- If evaluating the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py`: comment out the configuration for the 20B model and enable the configuration for the 7B model.

### Evaluation with the TurboMind gRPC API (optional)

In the project directory of OpenCompass, launch the Triton Inference Server:

```shell
bash turbomind/service_docker_up.sh
```

Then run the following command to evaluate:

```shell
python run.py configs/eval_internlm_turbomind_tis.py -w outputs/turbomind-tis/internlm-20b
```

**Note:**

- If evaluating the InternLM Chat model, please use the configuration file `eval_internlm_chat_turbomind_tis.py`.
- In the configuration file, the Triton Inference Server (TIS) address is `tis_addr='0.0.0.0:33337'`. Please modify `tis_addr` to the IP address of the machine where the server is launched.
- If evaluating the InternLM 7B model, please modify `eval_internlm_xxx_turbomind_tis.py`: comment out the configuration for the 20B model and enable the configuration for the 7B model.
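To make the 7B note above concrete, the following is roughly what the `models` list in `eval_internlm_turbomind.py` looks like after commenting out the 20B block and enabling the 7B block; it simply restates the commented-out variant from that config.

```python
# internlm-7b variant of the turbomind config, mirroring the commented-out
# block in eval_internlm_turbomind.py
from opencompass.models.turbomind import TurboMindModel

models = [
    dict(
        type=TurboMindModel,
        abbr='internlm-7b-turbomind',
        path='./turbomind',
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        concurrency=32,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```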


@@ -17,5 +17,7 @@ from .minimax_api import MiniMax  # noqa: F401
from .openai_api import OpenAI  # noqa: F401
from .pangu_api import PanGu  # noqa: F401
from .sensetime_api import SenseTime  # noqa: F401
from .turbomind import TurboMindModel  # noqa: F401
from .turbomind_tis import TurboMindTisModel  # noqa: F401
from .xunfei_api import XunFei  # noqa: F401
from .zhipuai_api import ZhiPuAI  # noqa: F401


@@ -1,11 +1,8 @@
import os.path as osp
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union

from opencompass.models.base import BaseModel
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList

@@ -23,71 +20,86 @@ def valid_str(string, coding='utf-8'):

class TurboMindModel(BaseModel):
    """Model wrapper for TurboMind Python API.

    Args:
        path (str): path of the turbomind model
        concurrency (int): the maximum allowed concurrency of turbomind.
        max_seq_len (int): The maximum allowed sequence length of a model.
            Note that the length of prompt + generated tokens shall not exceed
            this value. Defaults to 2048.
        meta_template (Dict, optional): The model's meta prompt
            template if needed, in case the requirement of injecting or
            wrapping of any meta instructions.
    """

    def __init__(
        self,
        path: str,
        concurrency: int = 8,
        max_seq_len: int = 2048,
        meta_template: Optional[Dict] = None,
    ):
        from lmdeploy import turbomind as tm
        from lmdeploy.tokenizer import Tokenizer

        super().__init__(path=path,
                         max_seq_len=max_seq_len,
                         meta_template=meta_template)
        self.logger = get_logger()
        tokenizer_model_path = osp.join(path, 'triton_models', 'tokenizer')
        self.tokenizer = Tokenizer(tokenizer_model_path)
        tm_model = tm.TurboMind(path)
        self.generators = [
            tm_model.create_instance() for i in range(concurrency)
        ]
        self.generator_ids = [i + 1 for i in range(concurrency)]
        self.generation_kwargs = dict()

    def generate(
        self,
        inputs: List[str],
        max_out_len: int = 512,
        temperature: float = 1.0,
    ) -> List[str]:
        """Generate results given a list of inputs.

        Args:
            inputs (List[str]): A list of prompts.
            max_out_len (int): The maximum length of the output.
            temperature (float): What sampling temperature to use,
                between 0 and 2. Higher values like 0.8 will make the output
                more random, while lower values like 0.2 will make it more
                focused and deterministic. Defaults to 1.0.

        Returns:
            List[str]: A list of generated strings.
        """
        assert isinstance(
            inputs, List), f'List(str) is expected, but got {type(inputs)}'

        # split inputs into batches
        batch_size = len(self.generators)
        batch_inputs = [
            inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)
        ]

        results = []
        for batch_input in batch_inputs:
            with ThreadPoolExecutor() as executor:
                _results = list(
                    executor.map(self._generate,
                                 self.generators[:len(batch_input)],
                                 self.generator_ids[:len(batch_input)],
                                 batch_input, [max_out_len] * len(batch_input),
                                 [temperature] * len(batch_input)))
                results += _results
        return results

    def get_token_len(self, prompt: str) -> int:
        input_ids = self.tokenizer.encode(prompt)
        return len(input_ids)

    def wait(self):
        """Wait till the next query can be sent.

@@ -95,8 +107,8 @@ class TurboMindModel(BaseModel):
        """
        return self.token_bucket.get_token()

    def _generate(self, generator, session_id, prompt: str or PromptList,
                  max_out_len: int, temperature: float) -> str:
        """Generate results given a list of inputs.

        Args:
@@ -113,20 +125,20 @@ class TurboMindModel(BaseModel):
            str: The generated string.
        """
        assert type(
            prompt) is str, 'We only support string for TurboMind Python API'

        prompt = '<BOS>' + prompt
        input_ids = self.tokenizer.encode(prompt)

        for outputs in generator.stream_infer(session_id=session_id,
                                              input_ids=[input_ids],
                                              request_output_len=max_out_len,
                                              sequence_start=True,
                                              sequence_end=True,
                                              top_k=1,
                                              step=0,
                                              stream_output=False):
            output_ids, _ = outputs[0]
            response = self.tokenizer.decode(output_ids.tolist())
            response = valid_str(response)
        return response
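A short, hypothetical usage sketch of the wrapper above, driven outside the OpenCompass runner; the model path and prompt are placeholders rather than part of this commit, and in normal use the class is instantiated from a config such as `eval_internlm_turbomind.py`.

```python
# Hypothetical standalone use of the TurboMindModel wrapper defined above;
# the path and prompt are placeholders.
from opencompass.models import TurboMindModel

model = TurboMindModel(
    path='./turbomind',  # folder produced by `lmdeploy convert`
    concurrency=8,       # number of turbomind generator instances
    max_seq_len=2048,
)
print(model.generate(['The capital of France is'], max_out_len=32))
```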


@@ -0,0 +1,130 @@
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.models.base import BaseModel, LMTemplateParser
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
bstr = bstr.replace(invalid_char, b'')
ret = bstr.decode(encoding=coding, errors='ignore')
return ret
class TurboMindTisModel(BaseModel):
"""Model wrapper for TurboMind Triton Inference Server gRPC API.
Args:
path (str): The name of the model served by turbomind's triton inference server.
tis_addr (str): The address (ip:port format) of turbomind's
triton inference server
max_seq_len (int): The maximum allowed sequence length of a model.
Note that the length of prompt + generated tokens shall not exceed
this value. Defaults to 2048.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
"""
is_api: bool = True
def __init__(
self,
path: str,
tis_addr: str = '0.0.0.0:33337',
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
):
super().__init__(path=path,
max_seq_len=max_seq_len,
meta_template=meta_template)
self.logger = get_logger()
self.template_parser = LMTemplateParser(meta_template)
self.eos_token_id = None
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
self.tis_addr = tis_addr
self.generation_kwargs = dict()
def generate(
self,
inputs: List[str or PromptList],
max_out_len: int = 512,
temperature: float = 1.0,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str or PromptList]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic. Defaults to 1.0.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)))
return results
def wait(self):
"""Wait till the next query can be sent.
Applicable in both single-thread and multi-thread environments.
"""
return self.token_bucket.get_token()
def _generate(self, prompt: str or PromptList, max_out_len: int,
temperature: float) -> str:
"""Generate results given a list of inputs.
Args:
prompt (str or PromptList): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic.
Returns:
str: The generated string.
"""
assert type(
prompt) is str, 'We only support string for TurboMind RPC API'
from lmdeploy.serve.turbomind.chatbot import Chatbot
chatbot = Chatbot(self.tis_addr,
temperature=temperature,
capability='completion',
top_k=1,
log_level=logging.ERROR)
for status, text, n_token in chatbot.stream_infer(
session_id=threading.currentThread().ident,
prompt=prompt,
request_output_len=max_out_len,
sequence_start=True,
sequence_end=True):
continue
response = valid_str(text)
response = response.replace('<eoa>', '')
return response
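For comparison with the Python API path, the gRPC client wrapped by `TurboMindTisModel._generate` can be exercised directly, as sketched below. The server address and prompt are assumptions; a turbomind Triton Inference Server is expected to be running, e.g. via `bash turbomind/service_docker_up.sh`.

```python
# Minimal sketch of the Triton Inference Server gRPC path wrapped by
# TurboMindTisModel. Assumptions: a turbomind TIS instance is listening on
# 0.0.0.0:33337 and the prompt is illustrative.
import logging

from lmdeploy.serve.turbomind.chatbot import Chatbot

chatbot = Chatbot('0.0.0.0:33337',
                  temperature=1.0,
                  capability='completion',
                  top_k=1,
                  log_level=logging.ERROR)

# stream_infer yields (status, text, n_token); the final `text` holds the
# full completion, as in TurboMindTisModel._generate
for status, text, n_token in chatbot.stream_infer(session_id=1,
                                                  prompt='The capital of France is',
                                                  request_output_len=32,
                                                  sequence_start=True,
                                                  sequence_end=True):
    pass
print(text.replace('<eoa>', ''))
```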