[Feature] Update configs for evaluating chat models like qwen, baichuan, llama2 using turbomind backend (#721)

* add llama2 test

* fix

* test qwen chat-7b

* test w4

* add baichuan2

* update

* update

* update configs and docs

* update
This commit is contained in:
RunningLeon 2023-12-21 18:22:17 +08:00 committed by GitHub
parent fbb912ddf3
commit e34c552282
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 235 additions and 158 deletions

View File

@ -5,112 +5,198 @@ from opencompass.models.turbomind import TurboMindModel
with read_base(): with read_base():
# choose a list of datasets # choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets # from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets # from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets # from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
# from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets # from .datasets.race.race_gen_69ee4f import race_datasets
from .datasets.race.race_gen_69ee4f import race_datasets # from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a chosen format # and output the results in a chosen format
from .summarizers.medium import summarizer from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
internlm_meta_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
meta_template = dict( llama2_meta_template = dict(
round=[ round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'), dict(role='HUMAN', begin='[INST] ', end=' [/INST]'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True), dict(role='BOT', generate=True),
], ],
eos_token_id=103028) eos_token_id=2)
qwen_meta_template = dict(round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT',
begin='\n<|im_start|>assistant\n',
end='<|im_end|>',
generate=True)
])
baichuan2_meta_template = dict(round=[
dict(role='HUMAN', begin='<reserved_106>'),
dict(role='BOT', begin='<reserved_107>', generate=True)
])
# config for internlm-chat-7b # config for internlm-chat-7b
# models = [ internlm_chat_7b = dict(
# dict( type=TurboMindModel,
# type=TurboMindModel, abbr='internlm-chat-7b-turbomind',
# abbr='internlm-chat-7b-turbomind', path='./turbomind',
# path="./turbomind", max_out_len=100,
# max_out_len=100, max_seq_len=2048,
# max_seq_len=2048, batch_size=32,
# batch_size=32, concurrency=32,
# concurrency=32, meta_template=internlm_meta_template,
# meta_template=meta_template, run_cfg=dict(num_gpus=1, num_procs=1),
# run_cfg=dict(num_gpus=1, num_procs=1), )
# )
# ]
# config for internlm-chat-7b-w4 model internlm_chat_7b_w4 = dict(
# models = [ type=TurboMindModel,
# dict( abbr='internlm-chat-7b-w4-turbomind',
# type=TurboMindModel, path='./turbomind',
# abbr='internlm-chat-7b-w4-turbomind', max_out_len=100,
# path="./turbomind", max_seq_len=2048,
# max_out_len=100, batch_size=32,
# max_seq_len=2048, concurrency=32,
# batch_size=32, meta_template=internlm_meta_template,
# concurrency=32, run_cfg=dict(num_gpus=1, num_procs=1),
# meta_template=meta_template, )
# run_cfg=dict(num_gpus=1, num_procs=1),
# )
# ]
# config for internlm-chat-7b-w4kv8 model # config for internlm-chat-7b-w4kv8 model
# models = [ internlm_chat_7b_w4kv8 = dict(
# dict( type=TurboMindModel,
# type=TurboMindModel, abbr='internlm-chat-7b-w4kv8-turbomind',
# abbr='internlm-chat-7b-w4kv8-turbomind', path='./turbomind',
# path="./turbomind", max_out_len=100,
# max_out_len=100, max_seq_len=2048,
# max_seq_len=2048, batch_size=32,
# batch_size=32, concurrency=32,
# concurrency=32, meta_template=internlm_meta_template,
# meta_template=meta_template, run_cfg=dict(num_gpus=1, num_procs=1),
# run_cfg=dict(num_gpus=1, num_procs=1), )
# )
# ]
# config for internlm-chat-20b # config for internlm-chat-20b
# models = [ internlm_chat_20b = dict(
# dict( type=TurboMindModel,
# type=TurboMindModel, abbr='internlm-chat-20b-turbomind',
# abbr='internlm-chat-20b-turbomind', path='./turbomind',
# path="./turbomind", max_out_len=100,
# max_out_len=100, max_seq_len=2048,
# max_seq_len=2048, batch_size=8,
# batch_size=8, concurrency=8,
# concurrency=8, meta_template=internlm_meta_template,
# meta_template=meta_template, run_cfg=dict(num_gpus=1, num_procs=1),
# run_cfg=dict(num_gpus=1, num_procs=1), )
# )
# ]
# config for internlm-chat-20b-w4 model # config for internlm-chat-20b-w4 model
models = [ internlm_chat_20b_w4 = dict(
dict( type=TurboMindModel,
type=TurboMindModel, abbr='internlm-chat-20b-w4-turbomind',
abbr='internlm-chat-20b-w4-turbomind', path='./turbomind',
path="./turbomind", max_out_len=100,
max_out_len=100, max_seq_len=2048,
max_seq_len=2048, batch_size=16,
batch_size=16, concurrency=16,
concurrency=16, meta_template=internlm_meta_template,
meta_template=meta_template, run_cfg=dict(num_gpus=1, num_procs=1),
run_cfg=dict(num_gpus=1, num_procs=1), )
)
]
# config for internlm-chat-20b-w4kv8 model # config for internlm-chat-20b-w4kv8 model
# models = [ internlm_chat_20b_w4kv8 = dict(
# dict( type=TurboMindModel,
# type=TurboMindModel, abbr='internlm-chat-20b-w4kv8-turbomind',
# abbr='internlm-chat-20b-w4kv8-turbomind', path='./turbomind',
# path="./turbomind", max_out_len=100,
# max_out_len=100, max_seq_len=2048,
# max_seq_len=2048, batch_size=16,
# batch_size=16, concurrency=16,
# concurrency=16, meta_template=internlm_meta_template,
# meta_template=meta_template, run_cfg=dict(num_gpus=1, num_procs=1),
# run_cfg=dict(num_gpus=1, num_procs=1), )
# )
# ] # config for llama2-chat-7b
llama2_chat_7b = dict(
type=TurboMindModel,
abbr='llama2-chat-7b-turbomind',
path='./turbomind',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
concurrency=32,
meta_template=llama2_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
# config for llama2-chat-13b
llama2_chat_13b = dict(
type=TurboMindModel,
abbr='llama2-chat-13b-turbomind',
path='./turbomind',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
concurrency=16,
meta_template=llama2_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
# config for llama2-chat-70b
llama2_chat_70b = dict(
type=TurboMindModel,
abbr='llama2-chat-70b-turbomind',
path='./turbomind',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
concurrency=8,
meta_template=llama2_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
# config for qwen-chat-7b
qwen_chat_7b = dict(
type=TurboMindModel,
abbr='qwen-chat-7b-turbomind',
path='./turbomind',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
concurrency=32,
meta_template=qwen_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
# config for qwen-chat-14b
qwen_chat_14b = dict(
type=TurboMindModel,
abbr='qwen-chat-14b-turbomind',
path='./turbomind',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
concurrency=32,
meta_template=qwen_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
# config for baichuan2-chat-7b
baichuan2_chat_7b = dict(
type=TurboMindModel,
abbr='baichuan2-chat-7b-turbomind',
path='./turbomind',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
concurrency=32,
meta_template=baichuan2_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
models = [internlm_chat_20b]

View File

@ -16,50 +16,43 @@ with read_base():
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# # config for internlm-7b model # # config for internlm-7b model
# models = [ internlm_7b = dict(
# dict( type=TurboMindModel,
# type=TurboMindModel, abbr='internlm-7b-turbomind',
# abbr='internlm-7b-turbomind', path="./turbomind",
# path="./turbomind", max_out_len=100,
# max_out_len=100, max_seq_len=2048,
# max_seq_len=2048, batch_size=32,
# batch_size=32, concurrency=32,
# concurrency=32, run_cfg=dict(num_gpus=1, num_procs=1),
# run_cfg=dict(num_gpus=1, num_procs=1), )
# )
# ]
# # config for internlm-7b-w4 model # # config for internlm-7b-w4 model
# models = [ internlm_7b_w4 = dict(
# dict( type=TurboMindModel,
# type=TurboMindModel, abbr='internlm-7b-w4-turbomind',
# abbr='internlm-7b-w4-turbomind', path="./turbomind",
# path="./turbomind", max_out_len=100,
# max_out_len=100, max_seq_len=2048,
# max_seq_len=2048, batch_size=32,
# batch_size=32, concurrency=32,
# concurrency=32, run_cfg=dict(num_gpus=1, num_procs=1),
# run_cfg=dict(num_gpus=1, num_procs=1), )
# )
# ]
# # config for internlm-7b-w4kv8 model # # config for internlm-7b-w4kv8 model
# models = [ internlm_7b_w4kv8 = dict(
# dict( type=TurboMindModel,
# type=TurboMindModel, abbr='internlm-7b-w4kv8-turbomind',
# abbr='internlm-7b-w4kv8-turbomind', path="./turbomind",
# path="./turbomind", max_out_len=100,
# max_out_len=100, max_seq_len=2048,
# max_seq_len=2048, batch_size=32,
# batch_size=32, concurrency=32,
# concurrency=32, run_cfg=dict(num_gpus=1, num_procs=1),
# run_cfg=dict(num_gpus=1, num_procs=1), )
# )
# ]
# config for internlm-20b model # config for internlm-20b model
models = [ internlm_20b = dict(
dict(
type=TurboMindModel, type=TurboMindModel,
abbr='internlm-20b-turbomind', abbr='internlm-20b-turbomind',
path="./turbomind", path="./turbomind",
@ -69,33 +62,30 @@ models = [
concurrency=8, concurrency=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
) )
]
# config for internlm-20b-w4 model # config for internlm-20b-w4 model
# models = [ internlm_20b_w4 = dict(
# dict( type=TurboMindModel,
# type=TurboMindModel, abbr='internlm-20b-w4-turbomind',
# abbr='internlm-20b-w4-turbomind', path="./turbomind",
# path="./turbomind", max_out_len=100,
# max_out_len=100, max_seq_len=2048,
# max_seq_len=2048, batch_size=16,
# batch_size=16, concurrency=16,
# concurrency=16, run_cfg=dict(num_gpus=1, num_procs=1),
# run_cfg=dict(num_gpus=1, num_procs=1), )
# )
# ]
# config for internlm-20b-w4kv8 model # config for internlm-20b-w4kv8 model
# models = [ internlm_20b_w4kv8 = dict(
# dict( type=TurboMindModel,
# type=TurboMindModel, abbr='internlm-20b-w4kv8-turbomind',
# abbr='internlm-20b-w4kv8-turbomind', path="./turbomind",
# path="./turbomind", max_out_len=100,
# max_out_len=100, max_seq_len=2048,
# max_seq_len=2048, batch_size=16,
# batch_size=16, concurrency=16,
# concurrency=16, run_cfg=dict(num_gpus=1, num_procs=1),
# run_cfg=dict(num_gpus=1, num_procs=1), )
# )
# ] models = [internlm_20b]

View File

@ -55,8 +55,9 @@ You are expected to get the evaluation results after the inference and evaluatio
**Note**: **Note**:
- If you evaluate theInternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py` - If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py`
- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by commenting out the configuration for the 20B model and enabling the configuration for the 7B model. - If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line.
- If you want to evaluate other chat models such as Llama2, QWen-7B, or Baichuan2-7B, change the `models` setting in `eval_internlm_chat_turbomind.py`.
### Evaluation with Turbomind gRPC API (optional) ### Evaluation with Turbomind gRPC API (optional)
@ -76,4 +77,4 @@ python run.py configs/eval_internlm_turbomind_tis.py -w outputs/turbomind-tis/in
- If the InternLM Chat model is requested to be evaluated, please use config file `eval_internlm_chat_turbomind_tis.py` - If the InternLM Chat model is requested to be evaluated, please use config file `eval_internlm_chat_turbomind_tis.py`
- In `eval_internlm_turbomind_tis.py`, the configured Triton Inference Server (TIS) address is `tis_addr='0.0.0.0:33337'`. Please modify `tis_addr` to the IP address of the machine where the server is launched. - In `eval_internlm_turbomind_tis.py`, the configured Triton Inference Server (TIS) address is `tis_addr='0.0.0.0:33337'`. Please modify `tis_addr` to the IP address of the machine where the server is launched.
- If evaluating the InternLM 7B model, please modify the config file, commenting out the configuration for the 20B model and enabling the configuration for the 7B model - If evaluating the InternLM 7B model, please modify the `models` configuration in `eval_internlm_xxx_turbomind_tis.py`.

View File

@ -52,7 +52,8 @@ python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-2
**注:** **注:**
- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py` - 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py`
- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。把 20B 模型的配置注释掉,打开 7B 模型的配置。 - 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。
- 如果评测其他模型如 Llama2, QWen-7B, Baichuan2-7B, 请修改`eval_internlm_chat_turbomind.py`中`models`字段 。
### 通过 TurboMind gRPC API 评测(可选) ### 通过 TurboMind gRPC API 评测(可选)
@ -72,5 +73,5 @@ python run.py configs/eval_internlm_turbomind_tis.py -w outputs/turbomind-tis/in
- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind_tis.py` - 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind_tis.py`
- 在配置文件中triton inference server(TIS) 地址是 `tis_addr='0.0.0.0:33337'`。请把配置中的`tis_addr`修改为server所在机器的ip地址。 - 在配置文件中triton inference server(TIS) 地址是 `tis_addr='0.0.0.0:33337'`。请把配置中的`tis_addr`修改为server所在机器的ip地址。
- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_xxx_turbomind_tis.py`。把其中 20B 模型的配置注释掉,打开 7B 模型的配置 - 如果评测 InternLM 7B 模型,请修改 `eval_internlm_xxx_turbomind_tis.py`中`models`字段
``` ```

View File

@ -126,7 +126,6 @@ class TurboMindModel(BaseModel):
assert type( assert type(
prompt) is str, 'We only support string for TurboMind Python API' prompt) is str, 'We only support string for TurboMind Python API'
prompt = '<BOS>' + prompt
input_ids = self.tokenizer.encode(prompt) input_ids = self.tokenizer.encode(prompt)
for outputs in generator.stream_infer(session_id=session_id, for outputs in generator.stream_infer(session_id=session_id,