diff --git a/configs/eval_internlm_chat_7b_turbomind.py b/configs/eval_internlm_chat_7b_turbomind.py
index aaddc4a5..dddc9898 100644
--- a/configs/eval_internlm_chat_7b_turbomind.py
+++ b/configs/eval_internlm_chat_7b_turbomind.py
@@ -3,30 +3,30 @@ from opencompass.models.turbomind import TurboMindModel
 
 with read_base():
     # choose a list of datasets
-    from .datasets.SuperGLUE_CB.SuperGLUE_CB_gen import CB_datasets
+    from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
     # and output the results in a choosen format
    from .summarizers.medium import summarizer
 
-datasets = [*CB_datasets]
+datasets = [*gsm8k_datasets]
 
-_meta_template = dict(
+meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True),
     ],
-)
+    eos_token_id=103028)
 
 models = [
     dict(
         type=TurboMindModel,
-        abbr='internlm-chat-7b-tb',
+        abbr='internlm-chat-7b-turbomind',
         path="internlm-chat-7b",
-        model_path='./workspace',
+        tis_addr='0.0.0.0:33337',
         max_out_len=100,
         max_seq_len=2048,
         batch_size=16,
-        meta_template=_meta_template,
+        meta_template=meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
     )
 ]
diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md
index 48623c22..33774357 100644
--- a/docs/en/advanced_guides/evaluation_turbomind.md
+++ b/docs/en/advanced_guides/evaluation_turbomind.md
@@ -38,12 +38,14 @@ python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-ch
 
 ```
 
-### Step-2: Verify the Converted Model
+### Step-2: Launch Triton Inference Server
 
 ```shell
-python -m lmdeploy.turbomind.chat ./workspace
+bash ./workspace/service_docker_up.sh
 ```
 
+**Note:** In the implementation of turbomind, inference is "persistent"; the "destroy" operation can lead to unexpected issues. Therefore, we temporarily use the service interface for model evaluation, and will integrate the Python API into OpenCompass once turbomind supports "destroy".
+
 ### Step-3: Evaluate the Converted Model
 
 In the home folder of OpenCompass
 
@@ -53,3 +55,5 @@ python run.py configs/eval_internlm_chat_7b_turbomind.py -w outputs/turbomind
 ```
 
 You are expected to get the evaluation results after the inference and evaluation.
+
+**Note:** In `eval_internlm_chat_7b_turbomind.py`, the configured Triton Inference Server (TIS) address is `tis_addr='0.0.0.0:33337'`. Please modify `tis_addr` to the IP address of the machine where the server is launched.
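As the note above says, `tis_addr` is the only field users normally need to edit. A minimal sketch of the `models` entry from `eval_internlm_chat_7b_turbomind.py` when the Triton Inference Server runs on a different machine; the address `10.0.1.2:33337` is a placeholder, not part of this patch:

```python
# Hypothetical variant of the patched config: point the evaluation at a remote
# Triton Inference Server. Only tis_addr differs from the config above; replace
# 10.0.1.2 with the IP of the machine where ./workspace/service_docker_up.sh runs.
models = [
    dict(
        type=TurboMindModel,
        abbr='internlm-chat-7b-turbomind',
        path="internlm-chat-7b",
        tis_addr='10.0.1.2:33337',
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        meta_template=meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
```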
diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
index ee1d790c..969c7911 100644
--- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md
+++ b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
@@ -38,12 +38,14 @@ python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-ch
 
 ```
 
-### 第二步: 验证转换后的模型
+### 第二步: 启动 TurboMind 的 Triton Inference Server
 
 ```shell
-python -m lmdeploy.turbomind.chat ./workspace
+bash ./workspace/service_docker_up.sh
 ```
 
+**注:** turbomind 的实现中,推理是“永驻”的。销毁操作会导致意想不到的问题发生。因此,我们暂时使用服务接口对接模型评测,待 turbomind 支持“销毁”之后,再提供 Python API 对接方式。
+
 ### 第三步: 评测转换后的模型
 
 在 OpenCompass 项目目录执行:
 
@@ -53,3 +55,5 @@ python run.py configs/eval_internlm_chat_7b_turbomind.py -w outputs/turbomind
 ```
 
 当模型完成推理和指标计算后,我们便可获得模型的评测结果。
+
+**注:** `eval_internlm_chat_7b_turbomind.py` 中,配置的 Triton Inference Server (TIS) 地址是 `tis_addr='0.0.0.0:33337'`。请把配置中的 `tis_addr` 修改为 server 所在机器的 IP 地址。
diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py
index 8d87906e..e81399ef 100644
--- a/opencompass/models/turbomind.py
+++ b/opencompass/models/turbomind.py
@@ -1,10 +1,11 @@
-import os.path as osp
-import random
+import logging
+import threading
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, List, Optional, Union
 
-from opencompass.models.base import BaseModel
-from opencompass.models.base_api import TokenBucket
+from lmdeploy.serve.turbomind.chatbot import Chatbot
+
+from opencompass.models.base import BaseModel, LMTemplateParser
 from opencompass.utils.logging import get_logger
 from opencompass.utils.prompt import PromptList
 
@@ -43,10 +44,8 @@ class TurboMindModel(BaseModel):
     def __init__(
         self,
         path: str,
-        model_path: str,
+        tis_addr: str = '0.0.0.0:33337',
         max_seq_len: int = 2048,
-        query_per_second: int = 1,
-        retry: int = 2,
         meta_template: Optional[Dict] = None,
     ):
 
@@ -54,27 +53,17 @@ class TurboMindModel(BaseModel):
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
         self.logger = get_logger()
-
-        from lmdeploy import turbomind as tm
-        from lmdeploy.model import MODELS as LMMODELS
-        from lmdeploy.turbomind.tokenizer import Tokenizer as LMTokenizer
-
-        self.retry = retry
-
-        tokenizer_model_path = osp.join(model_path, 'triton_models',
-                                        'tokenizer')
-        self.tokenizer = LMTokenizer(tokenizer_model_path)
-        tm_model = tm.TurboMind(model_path, eos_id=self.tokenizer.eos_token_id)
-        self.model_name = tm_model.model_name
-        self.model = LMMODELS.get(self.model_name)()
-        self.generator = tm_model.create_instance()
-        self.token_bucket = TokenBucket(query_per_second)
+        self.template_parser = LMTemplateParser(meta_template)
+        self.eos_token_id = None
+        if meta_template and 'eos_token_id' in meta_template:
+            self.eos_token_id = meta_template['eos_token_id']
+        self.tis_addr = tis_addr
 
     def generate(
         self,
         inputs: List[str or PromptList],
         max_out_len: int = 512,
-        temperature: float = 0.0,
+        temperature: float = 1.0,
     ) -> List[str]:
         """Generate results given a list of inputs.
 
@@ -91,10 +80,10 @@ class TurboMindModel(BaseModel):
         Returns:
             List[str]: A list of generated strings.
""" - prompts = inputs + with ThreadPoolExecutor() as executor: results = list( - executor.map(self._generate, prompts, + executor.map(self._generate, inputs, [max_out_len] * len(inputs), [temperature] * len(inputs))) return results @@ -106,12 +95,12 @@ class TurboMindModel(BaseModel): """ return self.token_bucket.get_token() - def _generate(self, input: str or PromptList, max_out_len: int, + def _generate(self, prompt: str or PromptList, max_out_len: int, temperature: float) -> str: """Generate results given a list of inputs. Args: - inputs (str or PromptList): A string or PromptDict. + prompt (str or PromptList): A string or PromptDict. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -123,39 +112,21 @@ class TurboMindModel(BaseModel): Returns: str: The generated string. """ - assert isinstance(input, (str, PromptList)) - assert type( - input - ) is str, 'We only support string for TurboMind Python API now' - - intput_token_ids = self.tokenizer.encode(input) - - for _ in range(self.retry): - self.wait() - session_id = random.randint(1, 100000) - nth_round = 0 - for outputs in self.generator.stream_infer( - session_id=session_id, - input_ids=[intput_token_ids], - stream_output=False, - request_output_len=max_out_len, - sequence_start=(nth_round == 0), - sequence_end=False, - step=0, - stop=False, - top_k=40, - top_p=0.8, - temperature=temperature, - repetition_penalty=1.0, - ignore_eos=False, - random_seed=random.getrandbits(64) - if nth_round == 0 else None): - pass - - output_token_ids, _ = outputs[0] - # decode output_token_ids - response = self.tokenizer.decode(output_token_ids) - response = valid_str(response) + prompt) is str, 'We only support string for TurboMind RPC API' + chatbot = Chatbot(self.tis_addr, + temperature=temperature, + capability='completion', + top_k=1, + log_level=logging.ERROR) + for status, text, n_token in chatbot.stream_infer( + session_id=threading.currentThread().ident, + prompt=prompt, + request_output_len=max_out_len, + sequence_start=True, + sequence_end=True): + continue + response = valid_str(text) + response = response.replace('', '') return response