Support lmdeploy pytorch engine (#875)

* add lmdeploy pytorch model

* fix

* speed up encoding and decoding

* fix

* change tokenizer
RunningLeon 2024-02-22 14:46:07 +08:00 committed by GitHub
parent 6d04decab4
commit 32ba0b074e
3 changed files with 227 additions and 0 deletions


@@ -0,0 +1,69 @@
from mmengine.config import read_base

from opencompass.models import LmdeployPytorchModel

with read_base():
    # choose a list of datasets
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
    from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
    from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
    from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from .datasets.race.race_gen_69ee4f import race_datasets
    from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
    # and output the results in a chosen format
    from .summarizers.medium import summarizer

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
        dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
    ],
    eos_token_id=103028)

# config for internlm-chat-7b
internlm_chat_7b = dict(
    type=LmdeployPytorchModel,
    abbr='internlm-chat-7b-pytorch',
    path='internlm/internlm-chat-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=16),
    gen_config=dict(top_k=1,
                    top_p=0.8,
                    temperature=1.0,
                    max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=16,
    concurrency=16,
    meta_template=meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)

# config for internlm-chat-20b
internlm_chat_20b = dict(
    type=LmdeployPytorchModel,
    abbr='internlm-chat-20b-pytorch',
    path='internlm/internlm-chat-20b',
    engine_config=dict(session_len=2048,
                       max_batch_size=8),
    gen_config=dict(top_k=1,
                    top_p=0.8,
                    temperature=1.0,
                    max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    meta_template=meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)

models = [internlm_chat_20b]
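
For a quick smoke test outside a full OpenCompass run, the wrapper defined later in this diff can also be driven directly. A minimal sketch (it assumes lmdeploy is installed, the checkpoint is reachable, and a GPU is available; the prompt markup follows the meta_template above):

from opencompass.models import LmdeployPytorchModel

# Minimal direct-use sketch: build the wrapper and generate one reply.
model = LmdeployPytorchModel(
    path='internlm/internlm-chat-7b',
    concurrency=1,
    engine_config=dict(session_len=2048, max_batch_size=1),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=64),
    end_str='<eoa>',
)
outputs = model.generate(['<|User|>:What is the capital of France?<eoh>\n<|Bot|>:'])
print(outputs[0])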

opencompass/models/__init__.py

@@ -14,6 +14,7 @@ from .huggingface import HuggingFaceChatGLM3  # noqa: F401, F403
from .intern_model import InternLM  # noqa: F401, F403
from .lightllm_api import LightllmAPI  # noqa: F401
from .llama2 import Llama2, Llama2Chat  # noqa: F401, F403
from .lmdeploy_pytorch import LmdeployPytorchModel  # noqa: F401
from .minimax_api import MiniMax  # noqa: F401
from .mixtral import Mixtral  # noqa: F401
from .modelscope import ModelScope, ModelScopeCausalLM  # noqa: F401, F403

opencompass/models/lmdeploy_pytorch.py

@@ -0,0 +1,157 @@
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union

from opencompass.models.base import BaseModel
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList

PromptType = Union[PromptList, str]


def valid_str(string, coding='utf-8'):
    """Remove invalid (UTF-8 replacement) characters from a string."""
    invalid_chars = [b'\xef\xbf\xbd']
    bstr = bytes(string, coding)
    for invalid_char in invalid_chars:
        bstr = bstr.replace(invalid_char, b'')
    ret = bstr.decode(encoding=coding, errors='ignore')
    return ret
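

# Worked example (illustrative addition): U+FFFD ('\ufffd') encodes to
# b'\xef\xbf\xbd' in UTF-8, so valid_str('ab\ufffdc') returns 'abc'.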

class LmdeployPytorchModel(BaseModel):
    """Model wrapper for the lmdeploy pytorch engine through its Python API.

    Args:
        path (str): path of the supported pytorch model.
        concurrency (int): the maximum number of concurrent inference
            sessions. Defaults to 8.
        max_seq_len (int): The maximum allowed sequence length of a model.
            Note that the length of prompt + generated tokens shall not exceed
            this value. Defaults to 2048.
        meta_template (Dict, optional): The model's meta prompt
            template if needed, used to inject or wrap any meta
            instructions into the prompt.
        engine_config (Dict, optional): The engine config used to set
            arguments like session_len and max_batch_size for the
            pytorch engine.
        gen_config (Dict, optional): Generation config used to set
            arguments like top_k, top_p and temperature.
        end_str (str, optional): A string used to trim generated outputs,
            for models with special ending strings that are not handled
            well otherwise. Defaults to None.
    """

    def __init__(self,
                 path: str,
                 concurrency: int = 8,
                 max_seq_len: int = 2048,
                 meta_template: Optional[Dict] = None,
                 engine_config: Optional[Dict] = None,
                 gen_config: Optional[Dict] = None,
                 end_str: Optional[str] = None):
        super().__init__(path=path,
                         max_seq_len=max_seq_len,
                         meta_template=meta_template)
        from lmdeploy.pytorch import engine as tm
        if engine_config is not None:
            from lmdeploy.messages import PytorchEngineConfig
            engine_config = PytorchEngineConfig(**engine_config)
        if gen_config is not None:
            from lmdeploy.messages import EngineGenerationConfig
            gen_config = EngineGenerationConfig(**gen_config)
        self.logger = get_logger()
        tm_model = tm.Engine(path, engine_config)
        self.tokenizer = tm_model.tokenizer
        self.generators = [
            tm_model.create_instance() for _ in range(concurrency)
        ]
        self.generator_ids = [i + 1 for i in range(concurrency)]
        self.gen_config = gen_config
        self.end_str = end_str
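        # Note (added for clarity): each generator instance above is paired
        # with a fixed session id (1..concurrency); generate() passes
        # matching slices of both lists to _generate for every batch.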

    def generate(
        self,
        inputs: List[str],
        max_out_len: int = 512,
    ) -> List[str]:
        """Generate results given a list of inputs.

        Args:
            inputs (List[str]): A list of prompts.
            max_out_len (int): The maximum length of the output.

        Returns:
            List[str]: A list of generated strings.
        """
        assert isinstance(
            inputs, List), f'List(str) is expected, but got {type(inputs)}'

        # split inputs into batches
        batch_size = len(self.generators)
        batch_inputs = [
            inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)
        ]

        results = []
        for batch_input in batch_inputs:
            with ThreadPoolExecutor() as executor:
                _results = list(
                    executor.map(
                        self._generate,
                        self.generators[:len(batch_input)],
                        self.generator_ids[:len(batch_input)],
                        batch_input,
                        [self.gen_config] * len(batch_input),
                        [self.end_str] * len(batch_input),
                    ))
            results += _results
        return results
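
    # Scheduling sketch (added for clarity): with concurrency = 4 and ten
    # prompts, generate() runs three waves of sizes 4, 4 and 2; each wave
    # fans its prompts out across the generator instances in parallel.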

    def get_token_len(self, prompt: str) -> int:
        """Return the number of tokens the tokenizer encodes a prompt into."""
        input_ids = self.tokenizer.encode(prompt)
        return len(input_ids)

    def wait(self):
        """Wait till the next query can be sent.

        Applicable in both single-thread and multi-thread environments.
        """
        return self.token_bucket.get_token()

    def _generate(self,
                  generator,
                  session_id,
                  prompt: PromptType,
                  gen_config=None,
                  end_str: Optional[str] = None) -> str:
        """Generate a completion for a single prompt.

        Args:
            prompt (PromptType): A string or PromptDict.
                The PromptDict should be organized in OpenCompass'
                API format.
            gen_config (EngineGenerationConfig, optional): Generation
                config used to set arguments like top_k, top_p and
                temperature.
            end_str (str, optional): A string used to trim generated
                outputs, for models with special ending strings that
                are not handled well otherwise. Defaults to None.

        Returns:
            str: The generated string.
        """
        assert type(
            prompt) is str, 'We only support string for the pytorch engine API'

        input_ids = self.tokenizer.encode(prompt)
        _, output_ids, _ = generator.infer(session_id,
                                           input_ids,
                                           gen_config=gen_config)
        # end the session to release per-request engine state
        if hasattr(generator, 'end'):
            generator.end(session_id)
        # decode the output token ids back to text
        response_all = self.tokenizer.decode(output_ids)
        # trim the response at end_str, if given
        if end_str:
            response_all = response_all.split(end_str)[0]
        # remove invalid (replacement) characters
        response_all = valid_str(response_all)
        return response_all
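
To make the post-processing at the end of `_generate` concrete, here is a standalone sketch that reuses `valid_str` from this file; the raw output string is invented for illustration and no engine is required:

# Standalone sketch of the trim-and-clean steps in _generate; the raw
# string below is invented for illustration.
raw = 'Paris is the capital of France.\ufffd<eoa> trailing tokens'
trimmed = raw.split('<eoa>')[0]  # trim at end_str ('<eoa>')
clean = valid_str(trimmed)       # drop U+FFFD replacement characters
assert clean == 'Paris is the capital of France.'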