[Feature] Add end_str for turbomind (#859)

* fix

* update

* fix internlm1

* fix docs

* remove sys
RunningLeon 2024-02-01 22:31:14 +08:00 committed by GitHub
parent 5c6dc908cd
commit 4c87e777d8
4 changed files with 57 additions and 8 deletions
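In short, turbomind model configs can now declare an end_str (e.g. '<eoa>' for internlm-chat, '<|im_end|>' for internlm2-chat), and TurboMindModel trims each decoded generation at that string. A minimal sketch of the trimming semantics, using an illustrative helper name that is not part of the diff:

from typing import Optional

def trim_at_end_str(response: str, end_str: Optional[str] = None) -> str:
    # Mirrors the logic added to TurboMindModel._generate below: keep only
    # the text before the first occurrence of end_str, when one is given.
    if end_str:
        response = response.split(end_str)[0]
    return response

assert trim_at_end_str('Paris.<eoa>\n<eoa>', end_str='<eoa>') == 'Paris.'
assert trim_at_end_str('Paris.') == 'Paris.'  # no end_str -> unchanged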

View File

@@ -23,6 +23,14 @@ internlm_meta_template = dict(round=[
     ],
     eos_token_id=103028)
 
+internlm2_meta_template = dict(
+    round=[
+        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
+        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
+    ],
+    eos_token_id=92542
+)
+
 # config for internlm-chat-7b
 internlm_chat_7b = dict(
     type=TurboMindModel,
@@ -41,6 +49,28 @@ internlm_chat_7b = dict(
     concurrency=32,
     meta_template=internlm_meta_template,
     run_cfg=dict(num_gpus=1, num_procs=1),
+    end_str='<eoa>',
+)
+
+# config for internlm2-chat-7b
+internlm2_chat_7b = dict(
+    type=TurboMindModel,
+    abbr='internlm2-chat-7b-turbomind',
+    path='internlm/internlm2-chat-7b',
+    engine_config=dict(session_len=2048,
+                       max_batch_size=32,
+                       rope_scaling_factor=1.0),
+    gen_config=dict(top_k=1,
+                    top_p=0.8,
+                    temperature=1.0,
+                    max_new_tokens=100),
+    max_out_len=100,
+    max_seq_len=2048,
+    batch_size=32,
+    concurrency=32,
+    meta_template=internlm2_meta_template,
+    run_cfg=dict(num_gpus=1, num_procs=1),
+    end_str='<|im_end|>'
+)
 )
 
 # config for internlm-chat-20b
@@ -61,6 +91,7 @@ internlm_chat_20b = dict(
     concurrency=8,
     meta_template=internlm_meta_template,
     run_cfg=dict(num_gpus=1, num_procs=1),
+    end_str='<eoa>',
 )
 
 models = [internlm_chat_20b]

View File

@@ -56,6 +56,7 @@ internlm_20b = dict(
     batch_size=8,
     concurrency=8,
     run_cfg=dict(num_gpus=1, num_procs=1),
+    end_str='<eoa>'
 )
 
 models = [internlm_20b]

View File

@@ -56,6 +56,7 @@ internlm_20b = dict(
     batch_size=8,
     concurrency=8,
     run_cfg=dict(num_gpus=1, num_procs=1),
+    end_str='<eoa>'
 )
 
 models = [internlm_20b]

View File

@@ -34,6 +34,9 @@ class TurboMindModel(BaseModel):
             arguments like session_len, max_batch_size for TurboMind.
         gen_config (Dict, optional): Generation config to set
             arguments like top_k, top_p, temperature.
+        end_str (str, optional): Whether to trim generated strings with end_str
+            if the model has special ending strings that are not handled well.
+            Defaults to None.
     """
 
     def __init__(self,
@@ -42,7 +45,8 @@ class TurboMindModel(BaseModel):
                  max_seq_len: int = 2048,
                  meta_template: Optional[Dict] = None,
                  engine_config: Optional[Dict] = None,
-                 gen_config: Optional[Dict] = None):
+                 gen_config: Optional[Dict] = None,
+                 end_str: Optional[str] = None):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
@@ -62,6 +66,7 @@ class TurboMindModel(BaseModel):
         ]
         self.generator_ids = [i + 1 for i in range(concurrency)]
         self.gen_config = gen_config
+        self.end_str = end_str
 
     def generate(
         self,
@@ -90,11 +95,15 @@
         for batch_input in batch_inputs:
             with ThreadPoolExecutor() as executor:
                 _results = list(
-                    executor.map(self._generate,
-                                 self.generators[:len(batch_input)],
-                                 self.generator_ids[:len(batch_input)],
-                                 batch_input, [max_out_len] * len(batch_input),
-                                 [self.gen_config] * len(batch_input)))
+                    executor.map(
+                        self._generate,
+                        self.generators[:len(batch_input)],
+                        self.generator_ids[:len(batch_input)],
+                        batch_input,
+                        [max_out_len] * len(batch_input),
+                        [self.gen_config] * len(batch_input),
+                        [self.end_str] * len(batch_input),
+                    ))
             results += _results
 
         return results
@@ -114,7 +123,8 @@
                   session_id,
                   prompt: str or PromptList,
                   max_out_len: int,
-                  gen_config=None) -> str:
+                  gen_config=None,
+                  end_str: Optional[str] = None) -> str:
         """Generate results given a list of inputs.
 
         Args:
@@ -124,7 +134,10 @@
             max_out_len (int): The maximum length of the output.
             gen_config (EngineGenerationConfig, optional): Generation
                 config to set arguments like top_k, top_p, temperature.
+            end_str (str, optional): Whether to trim generated strings
+                with end_str if the model has special ending strings
+                that are not handled well.
+                Defaults to None.
 
         Returns:
             str: The generated string.
         """
@@ -144,4 +157,7 @@
         _, output_ids, _ = outputs
         response = self.tokenizer.decode(output_ids)
         response = valid_str(response)
+        # used to trim
+        if end_str:
+            response = response.split(end_str)[0]
         return response
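
For reference, a hypothetical direct instantiation using the updated constructor. Keyword values are copied from the internlm2-chat-7b config above; the import path is assumed, and all other constructor arguments are assumed to keep their defaults:

from opencompass.models.turbomind import TurboMindModel  # import path assumed

model = TurboMindModel(
    path='internlm/internlm2-chat-7b',
    max_seq_len=2048,
    engine_config=dict(session_len=2048, max_batch_size=32),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    end_str='<|im_end|>',  # each generation is cut at the first '<|im_end|>'
)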