mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
Merge branch 'open-compass:main' into main
This commit is contained in:
commit
346c06015a
26
.github/scripts/eval_regression_base.py
vendored
26
.github/scripts/eval_regression_base.py
vendored
@ -8,15 +8,17 @@ with read_base():
|
||||
race_datasets # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
|
||||
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
|
||||
models as hf_deepseek_v2_lite_model # noqa: F401, E501
|
||||
# read hf models - chat models
|
||||
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
|
||||
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
|
||||
models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_2b import \
|
||||
models as hf_gemma_2b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_7b import \
|
||||
models as hf_gemma_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_2b import \
|
||||
models as hf_gemma2_2b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_9b import \
|
||||
models as hf_gemma2_9b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
|
||||
models as hf_internlm2_5_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
|
||||
@ -31,16 +33,28 @@ with read_base():
|
||||
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
|
||||
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
|
||||
models as hf_llama2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
|
||||
models as hf_llama3_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
|
||||
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
|
||||
models as lmdeploy_llama3_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
|
||||
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
|
||||
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
|
||||
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \
|
||||
models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
|
||||
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
|
||||
models as hf_qwen2_0_5b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
|
||||
models as hf_qwen2_1_5b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_7b import \
|
||||
models as hf_qwen2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
|
||||
models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
|
||||
|
34
.github/scripts/eval_regression_chat.py
vendored
34
.github/scripts/eval_regression_chat.py
vendored
@ -13,20 +13,32 @@ with read_base():
|
||||
models as hf_baichuan2_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
|
||||
models as hf_glm4_9b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
|
||||
models as lmdeploy_glm4_9b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.chatglm.vllm_glm4_9b_chat import \
|
||||
models as vllm_glm4_9b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
|
||||
models as hf_deepseek_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
|
||||
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
|
||||
models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
|
||||
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
|
||||
models as hf_gemma_2b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
|
||||
models as hf_gemma_7b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
|
||||
models as hf_gemma2_2b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
|
||||
models as hf_gemma2_9b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
|
||||
models as vllm_gemma_7b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
|
||||
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
|
||||
models as hf_internlm2_5_20b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
|
||||
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
|
||||
models as lmdeploy_internlm2_5_20b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
|
||||
models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
|
||||
@ -37,14 +49,20 @@ with read_base():
|
||||
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
|
||||
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
|
||||
models as hf_llama3_1_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
|
||||
models as hf_llama3_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
|
||||
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
|
||||
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
|
||||
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
|
||||
models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
|
||||
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \
|
||||
models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
|
||||
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
|
||||
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
|
||||
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
|
||||
@ -57,6 +75,10 @@ with read_base():
|
||||
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
|
||||
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
|
||||
models as hf_qwen2_1_5b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \
|
||||
models as hf_qwen2_7b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
|
||||
models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
|
||||
|
40
.github/scripts/oc_score_assert.py
vendored
40
.github/scripts/oc_score_assert.py
vendored
@ -8,29 +8,33 @@ output_path = 'regression_result_daily'
|
||||
|
||||
chat_model_list = [
|
||||
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
|
||||
'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf',
|
||||
'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind',
|
||||
'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind',
|
||||
'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind',
|
||||
'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf',
|
||||
'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
|
||||
'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf',
|
||||
'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
|
||||
'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
|
||||
'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf',
|
||||
'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
|
||||
'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind',
|
||||
'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
|
||||
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
|
||||
'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
|
||||
'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf',
|
||||
'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
|
||||
'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm',
|
||||
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
|
||||
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
|
||||
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
|
||||
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
|
||||
'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
|
||||
'lmdeploy-api-test'
|
||||
]
|
||||
base_model_list = [
|
||||
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
|
||||
'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf',
|
||||
'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
|
||||
'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind',
|
||||
'internlm2-7b-turbomind', 'internlm2-base-7b-hf',
|
||||
'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
|
||||
'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf',
|
||||
'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
|
||||
'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
|
||||
'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf',
|
||||
'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf',
|
||||
'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf',
|
||||
'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind',
|
||||
'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind',
|
||||
'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf',
|
||||
'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf',
|
||||
'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
|
||||
'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
|
||||
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
|
||||
]
|
||||
dataset_list = ['gsm8k', 'race-middle', 'race-high']
|
||||
|
||||
|
114
.github/scripts/oc_score_baseline.yaml
vendored
114
.github/scripts/oc_score_baseline.yaml
vendored
@ -8,6 +8,16 @@ glm-4-9b-chat-hf:
|
||||
race-middle: 88
|
||||
race-high: 88
|
||||
|
||||
glm-4-9b-chat-turbomind:
|
||||
gsm8k: 69
|
||||
race-middle: 82
|
||||
race-high: 77
|
||||
|
||||
glm-4-9b-chat-vllm:
|
||||
gsm8k: 73
|
||||
race-middle: 87
|
||||
race-high: 87
|
||||
|
||||
deepseek-7b-chat-hf:
|
||||
gsm8k: 60
|
||||
race-middle: 74
|
||||
@ -18,6 +28,11 @@ deepseek-moe-16b-chat-hf:
|
||||
race-middle: 62
|
||||
race-high: 70
|
||||
|
||||
deepseek-v2-lite-chat-hf:
|
||||
gsm8k: 59
|
||||
race-middle: 82
|
||||
race-high: 79
|
||||
|
||||
deepseek-7b-chat-vllm:
|
||||
gsm8k: 63
|
||||
race-middle: 74
|
||||
@ -33,23 +48,48 @@ gemma-7b-it-hf:
|
||||
race-middle: 74
|
||||
race-high: 71
|
||||
|
||||
gemma-7b-it-vllm:
|
||||
gsm8k: 38
|
||||
race-middle: 75
|
||||
race-high: 70
|
||||
|
||||
gemma2-2b-it-hf:
|
||||
gsm8k: 62
|
||||
race-middle: 75
|
||||
race-high: 67
|
||||
|
||||
gemma2-9b-it-hf:
|
||||
gsm8k: 80
|
||||
race-middle: 89
|
||||
race-high: 85
|
||||
|
||||
internlm2_5-7b-chat-hf:
|
||||
gsm8k: 86
|
||||
race-middle: 92
|
||||
race-high: 93
|
||||
|
||||
internlm2_5-20b-chat-hf:
|
||||
gsm8k: 91
|
||||
race-middle: 95
|
||||
race-high: 91
|
||||
|
||||
internlm2_5-7b-chat-turbomind:
|
||||
gsm8k: 87
|
||||
race-middle: 92
|
||||
race-high: 93
|
||||
|
||||
internlm2_5-20b-chat-turbomind:
|
||||
gsm8k: 91
|
||||
race-middle: 95
|
||||
race-high: 91
|
||||
|
||||
internlm2-chat-1.8b-turbomind:
|
||||
gsm8k: 40
|
||||
race-middle: 82
|
||||
race-high: 83
|
||||
|
||||
internlm2-chat-1.8b-sft-turbomind:
|
||||
gsm8k: 32
|
||||
gsm8k: 34
|
||||
race-middle: 81
|
||||
race-high: 83
|
||||
|
||||
@ -68,11 +108,21 @@ internlm2-chat-7b-vllm:
|
||||
race-middle: 90
|
||||
race-high: 91
|
||||
|
||||
llama-3_1-8b-instruct-hf:
|
||||
gsm8k: 82
|
||||
race-middle: 82
|
||||
race-high: 88
|
||||
|
||||
llama-3-8b-instruct-hf:
|
||||
gsm8k: 77
|
||||
race-middle: 85
|
||||
race-high: 87
|
||||
|
||||
llama-3_1-8b-instruct-turbomind:
|
||||
gsm8k: 79
|
||||
race-middle: 82
|
||||
race-high: 88
|
||||
|
||||
llama-3-8b-instruct-turbomind:
|
||||
gsm8k: 77
|
||||
race-middle: 85
|
||||
@ -83,6 +133,11 @@ mistral-7b-instruct-v0.2-hf:
|
||||
race-middle: 82
|
||||
race-high: 78
|
||||
|
||||
mistral-7b-instruct-v0.3-hf:
|
||||
gsm8k: 53
|
||||
race-middle: 80
|
||||
race-high: 78
|
||||
|
||||
mistral-7b-instruct-v0.2-vllm:
|
||||
gsm8k: 49
|
||||
race-middle: 81
|
||||
@ -118,6 +173,11 @@ qwen1.5-0.5b-chat-hf:
|
||||
race-middle: 55
|
||||
race-high: 50
|
||||
|
||||
qwen2-1.5b-instruct-hf:
|
||||
gsm8k: 63
|
||||
race-middle: 77
|
||||
race-high: 86
|
||||
|
||||
qwen2-1.5b-instruct-turbomind:
|
||||
gsm8k: 60
|
||||
race-middle: 77
|
||||
@ -128,6 +188,11 @@ qwen2-7b-instruct-turbomind:
|
||||
race-middle: 87
|
||||
race-high: 89
|
||||
|
||||
qwen2-7b-instruct-hf:
|
||||
gsm8k: 85
|
||||
race-middle: 87
|
||||
race-high: 91
|
||||
|
||||
qwen1.5-0.5b-chat-vllm:
|
||||
gsm8k: 5
|
||||
race-middle: 57
|
||||
@ -153,6 +218,11 @@ deepseek-moe-16b-base-hf:
|
||||
race-middle: 35
|
||||
race-high: 23
|
||||
|
||||
deepseek-v2-lite-hf:
|
||||
gsm8k: 37
|
||||
race-middle: 56
|
||||
race-high: 62
|
||||
|
||||
deepseek-7b-base-turbomind:
|
||||
gsm8k: 21
|
||||
race-middle: 42
|
||||
@ -173,8 +243,18 @@ gemma-7b-hf:
|
||||
race-middle: 59
|
||||
race-high: 66
|
||||
|
||||
gemma2-2b-hf:
|
||||
gsm8k: 33
|
||||
race-middle: 56
|
||||
race-high: 58
|
||||
|
||||
gemma2-9b-hf:
|
||||
gsm8k: 70
|
||||
race-middle: 82
|
||||
race-high: 84
|
||||
|
||||
internlm2_5-7b-hf:
|
||||
gsm8k: 46
|
||||
gsm8k: 47
|
||||
race-middle: 92
|
||||
race-high: 91
|
||||
|
||||
@ -208,6 +288,21 @@ internlm2-base-7b-turbomind:
|
||||
race-middle: 75
|
||||
race-high: 81
|
||||
|
||||
llama-2-7b-hf:
|
||||
gsm8k: 17
|
||||
race-middle: 32
|
||||
race-high: 38
|
||||
|
||||
llama-3-8b-hf:
|
||||
gsm8k: 48
|
||||
race-middle: 64
|
||||
race-high: 70
|
||||
|
||||
llama-3.1-8b-turbomind:
|
||||
gsm8k: 57
|
||||
race-middle: 67
|
||||
race-high: 75
|
||||
|
||||
llama-3-8b-turbomind:
|
||||
gsm8k: 52
|
||||
race-middle: 63
|
||||
@ -218,6 +313,11 @@ mistral-7b-v0.2-hf:
|
||||
race-middle: 42
|
||||
race-high: 60
|
||||
|
||||
mistral-7b-v0.3-hf:
|
||||
gsm8k: 43
|
||||
race-middle: 42
|
||||
race-high: 60
|
||||
|
||||
mistral-7b-v0.2-vllm:
|
||||
gsm8k: 45
|
||||
race-middle: 42
|
||||
@ -228,11 +328,21 @@ qwen1.5-moe-a2.7b-hf:
|
||||
race-middle: 78
|
||||
race-high: 90
|
||||
|
||||
qwen2-1.5b-hf:
|
||||
gsm8k: 58
|
||||
race-middle: 65
|
||||
race-high: 78
|
||||
|
||||
qwen2-0.5b-hf:
|
||||
gsm8k: 35
|
||||
race-middle: 52
|
||||
race-high: 48
|
||||
|
||||
qwen2-7b-hf:
|
||||
gsm8k: 82
|
||||
race-middle: 88
|
||||
race-high: 89
|
||||
|
||||
qwen2-1.5b-turbomind:
|
||||
gsm8k: 57
|
||||
race-middle: 64
|
||||
|
75
.github/workflows/daily-run-test.yml
vendored
75
.github/workflows/daily-run-test.yml
vendored
@ -14,9 +14,14 @@ env:
|
||||
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
|
||||
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
|
||||
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
|
||||
HF_DATASETS_OFFLINE: 1
|
||||
HF_EVALUATE_OFFLINE: 1
|
||||
TRANSFORMERS_OFFLINE: 1
|
||||
VLLM_USE_MODELSCOPE: false
|
||||
LMDEPLOY_USE_MODELSCOPE: false
|
||||
HF_HUB_OFFLINE: 1
|
||||
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
|
||||
|
||||
@ -43,7 +48,11 @@ jobs:
|
||||
|
||||
daily_run_test:
|
||||
needs: build-pypi
|
||||
runs-on: self-hosted
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
cuda_env: [dsw_cu11, dsw_cu12]
|
||||
runs-on: ${{ matrix.cuda_env }}
|
||||
environment: 'prod'
|
||||
timeout-minutes: 420 #7hours
|
||||
steps:
|
||||
@ -53,22 +62,38 @@ jobs:
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: my-artifact-${{ github.run_id }}
|
||||
- name: Prepare - create conda env and install torch
|
||||
- name: Prepare - create conda env and install torch - cu11
|
||||
if: ${{matrix.cuda_env == 'dsw_cu11'}}
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda create -y --name ${{env.CONDA_ENV}} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
pip install opencompass*.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
|
||||
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip uninstall torch torchvision torchaudio -y
|
||||
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
pip list
|
||||
- name: Prepare - create conda env and install torch - cu12
|
||||
if: ${{matrix.cuda_env == 'dsw_cu12'}}
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install lmdeploy==0.6.0 --cache-dir ${{env.PIP_CACHE_PATH}} --no-cache-dir
|
||||
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip uninstall torch torchvision torchaudio -y
|
||||
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
pip list
|
||||
- name: Prepare - prepare data and hf model
|
||||
run: |
|
||||
ln -s ${{env.DATEASET_CACHE_PATH}} data
|
||||
@ -77,45 +102,45 @@ jobs:
|
||||
- name: Run chat model test
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
|
||||
python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat/*/summary regression_result_daily
|
||||
opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run base model test
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base/*/summary regression_result_daily
|
||||
opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run command testcase
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
export from_tf=TRUE
|
||||
python tools/list_configs.py internlm2_5 mmlu
|
||||
python run.py --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1 --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1/*/summary regression_result_daily
|
||||
opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
python run.py --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2/*/summary regression_result_daily
|
||||
opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
python run.py --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3/*/summary regression_result_daily
|
||||
opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
python run.py --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4/*/summary regression_result_daily
|
||||
opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Remove Conda Env
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf regression_result_daily
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda env remove -y --name ${{env.CONDA_ENV}}
|
||||
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
|
||||
notify_to_feishu:
|
||||
|
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@ -17,7 +17,7 @@ jobs:
|
||||
python-version: '3.10'
|
||||
- name: Install pre-commit hook
|
||||
run: |
|
||||
pip install pre-commit mmengine
|
||||
pip install pre-commit==3.8.0 mmengine
|
||||
pre-commit install
|
||||
- name: Linting
|
||||
run: pre-commit run --all-files
|
||||
|
2
.github/workflows/pr-run-test.yml
vendored
2
.github/workflows/pr-run-test.yml
vendored
@ -51,7 +51,7 @@ jobs:
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
rm -rf regression_result
|
||||
python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug
|
||||
opencompass --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug
|
||||
- name: Get result
|
||||
run: |
|
||||
score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
|
||||
|
@ -594,7 +594,7 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
|
||||
## 🔜 Roadmap
|
||||
|
||||
- [x] Subjective Evaluation
|
||||
- [x] Release CompassAreana
|
||||
- [x] Release CompassAreana.
|
||||
- [x] Subjective evaluation.
|
||||
- [x] Long-context
|
||||
- [x] Long-context evaluation with extensive datasets.
|
||||
@ -603,10 +603,10 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
|
||||
- [ ] Coding evaluation leaderboard.
|
||||
- [x] Non-python language evaluation service.
|
||||
- [x] Agent
|
||||
- [ ] Support various agenet framework.
|
||||
- [ ] Support various agent frameworks.
|
||||
- [x] Evaluation of tool use of the LLMs.
|
||||
- [x] Robustness
|
||||
- [x] Support various attack method
|
||||
- [x] Support various attack methods.
|
||||
|
||||
## 👷♂️ Contributing
|
||||
|
||||
|
38
configs/api_examples/eval_api_bailing.py
Normal file
38
configs/api_examples/eval_api_bailing.py
Normal file
@ -0,0 +1,38 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
from opencompass.models import BailingAPI
|
||||
from opencompass.partitioners import NaivePartitioner
|
||||
from opencompass.runners.local_api import LocalAPIRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
datasets = [
|
||||
*ceval_datasets,
|
||||
]
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Lite-0830',
|
||||
token='xxxxxx', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
generation_kwargs={},
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
),
|
||||
]
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=NaivePartitioner),
|
||||
runner=dict(
|
||||
type=LocalAPIRunner,
|
||||
max_num_workers=2,
|
||||
concurrent_users=2,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
|
||||
work_dir = 'outputs/api_bailing/'
|
34
configs/datasets/dingo/dingo_gen.py
Normal file
34
configs/datasets/dingo/dingo_gen.py
Normal file
@ -0,0 +1,34 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import DingoDataset, DingoEvaluator
|
||||
|
||||
|
||||
dingo_paths = [
|
||||
'./data/dingo/en_192.csv',
|
||||
'./data/dingo/zh_170.csv',
|
||||
]
|
||||
|
||||
dingo_datasets = []
|
||||
for path in dingo_paths:
|
||||
dingo_reader_cfg = dict(input_columns='input', output_column=None)
|
||||
dingo_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[dict(role='HUMAN', prompt='{input}')])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT')
|
||||
|
||||
dingo_datasets.append(
|
||||
dict(
|
||||
abbr='dingo_' + path.split('/')[-1].split('.csv')[0],
|
||||
type=DingoDataset,
|
||||
path=path,
|
||||
reader_cfg=dingo_reader_cfg,
|
||||
infer_cfg=dingo_infer_cfg,
|
||||
eval_cfg=dingo_eval_cfg,
|
||||
))
|
||||
|
||||
datasets = dingo_datasets
|
@ -15,7 +15,7 @@ subjective_all_sets = [
|
||||
]
|
||||
data_path ='data/subjective/followbench/converted_data'
|
||||
|
||||
followbench_llmeval_dataset = []
|
||||
followbench_llmeval_datasets = []
|
||||
|
||||
for _name in subjective_all_sets:
|
||||
subjective_infer_cfg = dict(
|
||||
@ -48,7 +48,7 @@ for _name in subjective_all_sets:
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
followbench_llmeval_dataset.append(
|
||||
followbench_llmeval_datasets.append(
|
||||
dict(
|
||||
abbr=f'{_name}',
|
||||
type=FollowBenchDataset,
|
||||
|
73
configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py
Normal file
73
configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py
Normal file
@ -0,0 +1,73 @@
|
||||
import copy
|
||||
|
||||
from opencompass.datasets import WikiBenchDataset
|
||||
from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator
|
||||
from opencompass.openicl.icl_inferencer import PPLInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
|
||||
single_choice_prompts = {
|
||||
'single_choice_cn': [
|
||||
dict(role='HUMAN',
|
||||
prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501
|
||||
),
|
||||
dict(role='BOT', prompt='回答: B'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次'
|
||||
),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501
|
||||
),
|
||||
dict(role='BOT', prompt='回答: B'),
|
||||
dict(role='HUMAN',
|
||||
prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 丹阳县'),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(role='HUMAN', prompt='问题: {question}'),
|
||||
dict(role='BOT', prompt='回答: {answer}'),
|
||||
]
|
||||
}
|
||||
|
||||
wikibench_sets = {
|
||||
'wiki': ['single_choice_cn'],
|
||||
}
|
||||
|
||||
do_circular = True
|
||||
|
||||
wikibench_datasets = []
|
||||
|
||||
for _split in list(wikibench_sets.keys()):
|
||||
for _name in wikibench_sets[_split]:
|
||||
template = {}
|
||||
for answer in ['A', 'B', 'C', 'D']:
|
||||
one_template_round = copy.deepcopy(single_choice_prompts[_name])
|
||||
one_template_round[-1]['prompt'] = one_template_round[-1][
|
||||
'prompt'].format(answer=answer)
|
||||
template[answer] = dict(round=one_template_round)
|
||||
wikibench_infer_cfg = dict(
|
||||
prompt_template=dict(type=PromptTemplate, template=template),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=PPLInferencer),
|
||||
)
|
||||
wikibench_eval_cfg = dict(evaluator=dict(
|
||||
type=CircularEvaluator if do_circular else AccEvaluator), )
|
||||
wikibench_datasets.append(
|
||||
dict(
|
||||
type=WikiBenchDataset,
|
||||
path=f'./data/WikiBench/{_name}.jsonl',
|
||||
name='circular_' + _name if do_circular else _name,
|
||||
abbr='wikibench-' + _split + '-' + _name +
|
||||
'circular' if do_circular else '',
|
||||
reader_cfg=dict(
|
||||
input_columns=['question'],
|
||||
output_column='answer',
|
||||
),
|
||||
infer_cfg=wikibench_infer_cfg,
|
||||
eval_cfg=wikibench_eval_cfg,
|
||||
))
|
188
configs/eval_corebench_2409_base_objective.py
Normal file
188
configs/eval_corebench_2409_base_objective.py
Normal file
@ -0,0 +1,188 @@
|
||||
from mmengine.config import read_base
|
||||
import os.path as osp
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 0 Essential Configs #
|
||||
#######################################################################
|
||||
with read_base():
|
||||
# Datasets Part
|
||||
## Core Set
|
||||
# ## Examination
|
||||
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
|
||||
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
|
||||
mmlu_pro_datasets
|
||||
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
|
||||
cmmlu_datasets
|
||||
# ## Reasoning
|
||||
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets
|
||||
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets
|
||||
from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets
|
||||
|
||||
# ## Math
|
||||
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import math_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets
|
||||
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
|
||||
mathbench_datasets
|
||||
|
||||
# ## Scientific
|
||||
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \
|
||||
gpqa_datasets
|
||||
|
||||
# ## Coding
|
||||
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets
|
||||
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import sanitized_mbpp_datasets
|
||||
# TODO: Add LiveCodeBench
|
||||
|
||||
# ## Instruction Following
|
||||
# from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
|
||||
|
||||
# Summarizer
|
||||
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
|
||||
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
|
||||
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
|
||||
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
|
||||
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
|
||||
mathbench_2024_summary_groups
|
||||
|
||||
# Model List
|
||||
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import models as lmdeploy_qwen2_5_1_5b_model
|
||||
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
|
||||
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
|
||||
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
|
||||
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
|
||||
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
|
||||
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
|
||||
|
||||
#######################################################################
|
||||
# PART 1 Datasets List #
|
||||
#######################################################################
|
||||
# datasets list for evaluation
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 2 Datset Summarizer #
|
||||
#######################################################################
|
||||
# with read_base():
|
||||
|
||||
core_summary_groups = [
|
||||
{
|
||||
'name': 'core_average',
|
||||
'subsets': [
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['cmmlu', 'accuracy'],
|
||||
['bbh', 'naive_average'],
|
||||
['hellaswag', 'accuracy'],
|
||||
['drop', 'accuracy'],
|
||||
['math', 'accuracy'],
|
||||
['gsm8k', 'accuracy'],
|
||||
['mathbench-t (average)', 'naive_average'],
|
||||
['GPQA_diamond', 'accuracy'],
|
||||
['openai_humaneval', 'humaneval_pass@1'],
|
||||
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||
['sanitized_mbpp', 'score'],
|
||||
['mathbench-t (average)', 'naive_average']
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['cmmlu', 'accuracy'],
|
||||
['bbh', 'naive_average'],
|
||||
['hellaswag', 'accuracy'],
|
||||
['drop', 'accuracy'],
|
||||
['math', 'accuracy'],
|
||||
['gsm8k', 'accuracy'],
|
||||
['mathbench-t (average)', 'naive_average'],
|
||||
['GPQA_diamond', 'accuracy'],
|
||||
['openai_humaneval', 'humaneval_pass@1'],
|
||||
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||
['sanitized_mbpp', 'score'],
|
||||
'mathbench-a (average)',
|
||||
'mathbench-t (average)'
|
||||
'',
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu-stem', 'accuracy'],
|
||||
['mmlu-social-science', 'accuracy'],
|
||||
['mmlu-humanities', 'accuracy'],
|
||||
['mmlu-other', 'accuracy'],
|
||||
|
||||
'',
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['mmlu_pro_math','accuracy'],
|
||||
['mmlu_pro_physics', 'accuracy'],
|
||||
['mmlu_pro_chemistry', 'accuracy'],
|
||||
['mmlu_pro_law', 'accuracy'],
|
||||
['mmlu_pro_engineering', 'accuracy'],
|
||||
['mmlu_pro_other', 'accuracy'],
|
||||
['mmlu_pro_economics', 'accuracy'],
|
||||
['mmlu_pro_health', 'accuracy'],
|
||||
['mmlu_pro_psychology', 'accuracy'],
|
||||
['mmlu_pro_business', 'accuracy'],
|
||||
['mmlu_pro_biology', 'accuracy'],
|
||||
['mmlu_pro_philosophy', 'accuracy'],
|
||||
['mmlu_pro_computer_science','accuracy'],
|
||||
['mmlu_pro_history', 'accuracy'],
|
||||
'',
|
||||
['cmmlu', 'accuracy'],
|
||||
['cmmlu-stem', 'accuracy'],
|
||||
['cmmlu-social-science', 'accuracy'],
|
||||
['cmmlu-humanities', 'accuracy'],
|
||||
['cmmlu-other', 'accuracy'],
|
||||
['cmmlu-china-specific', 'accuracy'],
|
||||
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||
)
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 3 Models List #
|
||||
#######################################################################
|
||||
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 4 Inference/Evaluation Configuaration #
|
||||
#######################################################################
|
||||
|
||||
# Local Runner
|
||||
infer = dict(
|
||||
partitioner=dict(
|
||||
type=NumWorkerPartitioner,
|
||||
num_worker=8
|
||||
),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
retry=0, # Modify if needed
|
||||
task=dict(type=OpenICLInferTask)
|
||||
),
|
||||
)
|
||||
|
||||
# eval with local runner
|
||||
eval = dict(
|
||||
partitioner=dict(type=NaivePartitioner, n=10),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
task=dict(type=OpenICLEvalTask)),
|
||||
)
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 5 Utils Configuaration #
|
||||
#######################################################################
|
||||
base_exp_dir = 'outputs/corebench_2409_objective/'
|
||||
work_dir = osp.join(base_exp_dir, 'base_objective')
|
220
configs/eval_corebench_2409_chat_objective.py
Normal file
220
configs/eval_corebench_2409_chat_objective.py
Normal file
@ -0,0 +1,220 @@
|
||||
from mmengine.config import read_base
|
||||
import os.path as osp
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 0 Essential Configs #
|
||||
#######################################################################
|
||||
with read_base():
|
||||
# Datasets Part
|
||||
## Core Set
|
||||
# ## Examination
|
||||
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets
|
||||
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
|
||||
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets
|
||||
|
||||
# ## Reasoning
|
||||
from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
|
||||
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
|
||||
hellaswag_datasets
|
||||
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import drop_datasets
|
||||
|
||||
# ## Math
|
||||
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
|
||||
gsm8k_datasets
|
||||
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import mathbench_datasets
|
||||
|
||||
# ## Scientific
|
||||
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
|
||||
|
||||
# ## Coding
|
||||
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import sanitized_mbpp_datasets
|
||||
# TODO: Add LiveCodeBench
|
||||
|
||||
# ## Instruction Following
|
||||
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
|
||||
|
||||
# Summarizer
|
||||
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
|
||||
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
|
||||
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
|
||||
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
|
||||
|
||||
|
||||
# Model List
|
||||
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
|
||||
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
|
||||
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
|
||||
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
|
||||
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
|
||||
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
|
||||
|
||||
#######################################################################
|
||||
# PART 1 Datasets List #
|
||||
#######################################################################
|
||||
# datasets list for evaluation
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 2 Datset Summarizer #
|
||||
#######################################################################
|
||||
# with read_base():
|
||||
|
||||
core_summary_groups = [
|
||||
{
|
||||
'name': 'core_average',
|
||||
'subsets': [
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['cmmlu', 'accuracy'],
|
||||
['bbh', 'score'],
|
||||
['math', 'accuracy'],
|
||||
['openai_humaneval', 'humaneval_pass@1'],
|
||||
['GPQA_diamond', 'accuracy'],
|
||||
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||
['drop', 'accuracy'],
|
||||
['sanitized_mbpp', 'score'],
|
||||
['gsm8k', 'accuracy'],
|
||||
['hellaswag', 'accuracy'],
|
||||
['mathbench-t (average)', 'naive_average']
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
['core_average', 'naive_average'],
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['cmmlu', 'accuracy'],
|
||||
['bbh', 'score'],
|
||||
['math', 'accuracy'],
|
||||
['openai_humaneval', 'humaneval_pass@1'],
|
||||
['GPQA_diamond', 'accuracy'],
|
||||
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||
['drop', 'accuracy'],
|
||||
['sanitized_mbpp', 'score'],
|
||||
['gsm8k', 'accuracy'],
|
||||
['hellaswag', 'accuracy'],
|
||||
'mathbench-a (average)',
|
||||
'mathbench-t (average)'
|
||||
'',
|
||||
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu-stem', 'accuracy'],
|
||||
['mmlu-social-science', 'accuracy'],
|
||||
['mmlu-humanities', 'accuracy'],
|
||||
['mmlu-other', 'accuracy'],
|
||||
|
||||
'',
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['mmlu_pro_math','accuracy'],
|
||||
['mmlu_pro_physics', 'accuracy'],
|
||||
['mmlu_pro_chemistry', 'accuracy'],
|
||||
['mmlu_pro_law', 'accuracy'],
|
||||
['mmlu_pro_engineering', 'accuracy'],
|
||||
['mmlu_pro_other', 'accuracy'],
|
||||
['mmlu_pro_economics', 'accuracy'],
|
||||
['mmlu_pro_health', 'accuracy'],
|
||||
['mmlu_pro_psychology', 'accuracy'],
|
||||
['mmlu_pro_business', 'accuracy'],
|
||||
['mmlu_pro_biology', 'accuracy'],
|
||||
['mmlu_pro_philosophy', 'accuracy'],
|
||||
['mmlu_pro_computer_science','accuracy'],
|
||||
['mmlu_pro_history', 'accuracy'],
|
||||
'',
|
||||
['cmmlu', 'accuracy'],
|
||||
['cmmlu-stem', 'accuracy'],
|
||||
['cmmlu-social-science', 'accuracy'],
|
||||
['cmmlu-humanities', 'accuracy'],
|
||||
['cmmlu-other', 'accuracy'],
|
||||
['cmmlu-china-specific', 'accuracy'],
|
||||
'',
|
||||
['bbh', 'extract_rate'],
|
||||
['math', 'extract_rate'],
|
||||
# ['openai_humaneval', 'extract_rate'],
|
||||
['GPQA_diamond', 'extract_rate'],
|
||||
# ['IFEval', 'extract_rate'],
|
||||
'',
|
||||
['mmlu', 'extract_rate'],
|
||||
['mmlu-stem', 'extract_rate'],
|
||||
['mmlu-social-science', 'extract_rate'],
|
||||
['mmlu-humanities', 'extract_rate'],
|
||||
['mmlu-other', 'extract_rate'],
|
||||
'',
|
||||
['mmlu_pro', 'extract_rate'],
|
||||
['mmlu_pro_math', 'extract_rate'],
|
||||
['mmlu_pro_physics', 'extract_rate'],
|
||||
['mmlu_pro_chemistry', 'extract_rate'],
|
||||
['mmlu_pro_law', 'extract_rate'],
|
||||
['mmlu_pro_engineering', 'extract_rate'],
|
||||
['mmlu_pro_other', 'extract_rate'],
|
||||
['mmlu_pro_economics', 'extract_rate'],
|
||||
['mmlu_pro_health', 'extract_rate'],
|
||||
['mmlu_pro_psychology', 'extract_rate'],
|
||||
['mmlu_pro_business', 'extract_rate'],
|
||||
['mmlu_pro_biology', 'extract_rate'],
|
||||
['mmlu_pro_philosophy', 'extract_rate'],
|
||||
['mmlu_pro_computer_science', 'extract_rate'],
|
||||
['mmlu_pro_history', 'extract_rate'],
|
||||
'',
|
||||
['cmmlu', 'extract_rate'],
|
||||
['cmmlu-stem', 'extract_rate'],
|
||||
['cmmlu-social-science', 'extract_rate'],
|
||||
['cmmlu-humanities', 'extract_rate'],
|
||||
['cmmlu-other', 'extract_rate'],
|
||||
['cmmlu-china-specific', 'extract_rate'],
|
||||
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||
)
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 3 Models List #
|
||||
#######################################################################
|
||||
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 4 Inference/Evaluation Configuaration #
|
||||
#######################################################################
|
||||
|
||||
# Local Runner
|
||||
infer = dict(
|
||||
partitioner=dict(
|
||||
type=NumWorkerPartitioner,
|
||||
num_worker=8
|
||||
),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
retry=0, # Modify if needed
|
||||
task=dict(type=OpenICLInferTask)
|
||||
),
|
||||
)
|
||||
|
||||
# eval with local runner
|
||||
eval = dict(
|
||||
partitioner=dict(type=NaivePartitioner, n=10),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
task=dict(type=OpenICLEvalTask)),
|
||||
)
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 5 Utils Configuaration #
|
||||
#######################################################################
|
||||
base_exp_dir = 'outputs/corebench_2409_objective/'
|
||||
work_dir = osp.join(base_exp_dir, 'chat_objective')
|
138
configs/eval_corebench_2409_longcontext.py
Normal file
138
configs/eval_corebench_2409_longcontext.py
Normal file
@ -0,0 +1,138 @@
|
||||
import os.path as osp
|
||||
from copy import deepcopy
|
||||
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import (HuggingFacewithChatTemplate,
|
||||
TurboMindModelwithChatTemplate)
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.runners import DLCRunner, LocalRunner
|
||||
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 0 Essential Configs #
|
||||
#######################################################################
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.longbench.longbench import \
|
||||
longbench_datasets
|
||||
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \
|
||||
needlebench_datasets as needlebench_8k_datasets
|
||||
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \
|
||||
needlebench_datasets as needlebench_32k_datasets
|
||||
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
|
||||
needlebench_datasets as needlebench_128k_datasets
|
||||
from opencompass.configs.datasets.ruler.ruler_8k_gen import \
|
||||
ruler_datasets as ruler_8k_datasets
|
||||
from opencompass.configs.datasets.ruler.ruler_32k_gen import \
|
||||
ruler_datasets as ruler_32k_datasets
|
||||
from opencompass.configs.datasets.ruler.ruler_128k_gen import \
|
||||
ruler_datasets as ruler_128k_datasets
|
||||
# Summary Groups
|
||||
from opencompass.configs.summarizers.groups.longbench import \
|
||||
longbench_summary_groups
|
||||
from opencompass.configs.summarizers.groups.ruler import \
|
||||
ruler_summary_groups
|
||||
from opencompass.configs.summarizers.needlebench import (
|
||||
needlebench_8k_summarizer, needlebench_32k_summarizer,
|
||||
needlebench_128k_summarizer)
|
||||
|
||||
# Instruct models
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
|
||||
models as lmdeploy_qwen2_7b_instruct_model
|
||||
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
|
||||
models as lmdeploy_internlm2_5_7b_1m_chat_model
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
|
||||
models as llama3_1_8b_instruct_model
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 1 Datasets List #
|
||||
#######################################################################
|
||||
# datasets list for evaluation
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 2 Datset Summarizer #
|
||||
#######################################################################
|
||||
needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups']
|
||||
needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups']
|
||||
needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups']
|
||||
|
||||
# Instruct models summarizer
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
['ruler_8k', 'naive_average'],
|
||||
['ruler_32k', 'naive_average'],
|
||||
['ruler_128k', 'naive_average'],
|
||||
['NeedleBench-Overall-Score-8K', 'weighted_average'],
|
||||
['NeedleBench-Overall-Score-32K', 'weighted_average'],
|
||||
['NeedleBench-Overall-Score-128K', 'weighted_average'],
|
||||
['longbench', 'naive_average'],
|
||||
['longbench_zh', 'naive_average'],
|
||||
['longbench_en', 'naive_average'],
|
||||
'',
|
||||
'longbench_single-document-qa',
|
||||
'longbench_multi-document-qa',
|
||||
'longbench_summarization',
|
||||
'longbench_few-shot-learning',
|
||||
'longbench_synthetic-tasks',
|
||||
'longbench_code-completion',
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||
)


#######################################################################
#                        PART 3  Models List                          #
#######################################################################

lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4

llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4
llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4

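# The overrides above stretch both instruct models to a ~1M-token window:
# max_seq_len / session_len are raised to 1048576, rope_scaling_factor=4
# extends the RoPE positional range, and tp=4 with num_gpus=4 shards each
# model across four GPUs.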
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

#######################################################################
#               PART 4  Inference/Evaluation Configuration            #
#######################################################################

# Local Runner
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask)
    ),
)

# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLEvalTask)),
)


#######################################################################
#                      PART 5  Utils Configuration                    #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'long_context')
|
configs/eval_corebench_2409_subjective.py (new file, 134 lines)
@@ -0,0 +1,134 @@
|
||||
import os.path as osp
|
||||
from copy import deepcopy
|
||||
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import (HuggingFacewithChatTemplate,
|
||||
TurboMindModelwithChatTemplate)
|
||||
from opencompass.models.openai_api import OpenAI, OpenAISDK
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.runners import DLCRunner, LocalRunner
|
||||
from opencompass.summarizers import SubjectiveSummarizer
|
||||
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 0 Essential Configs #
|
||||
#######################################################################
|
||||
with read_base():
|
||||
# Datasets Part
|
||||
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
|
||||
arenahard_datasets
|
||||
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
|
||||
alignbench_datasets
|
||||
from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \
|
||||
mtbench_datasets
|
||||
|
||||
# Summarizer
|
||||
|
||||
# Model List
|
||||
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
|
||||
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 1 Datasets List #
|
||||
#######################################################################
|
||||
# datasets list for evaluation
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 2 Dataset Summarizer #
|
||||
#######################################################################
|
||||
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
|
||||
|
||||
#######################################################################
|
||||
# PART 3 Models List #
|
||||
#######################################################################
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='internlm2_5-7b-chat-turbomind',
|
||||
path='internlm/internlm2_5-7b-chat',
|
||||
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=40, temperature=1.0, top_p=0.9, max_new_tokens=4096),
|
||||
max_seq_len=16384,
|
||||
max_out_len=4096,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], models)
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 4 Inference/Evaluation Configuration #
|
||||
#######################################################################
|
||||
|
||||
# Local Runner
|
||||
infer = dict(
|
||||
partitioner=dict(
|
||||
type=NumWorkerPartitioner,
|
||||
num_worker=8
|
||||
),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
retry=0, # Modify if needed
|
||||
task=dict(type=OpenICLInferTask)
|
||||
),
|
||||
)
|
||||
|
||||
# JudgeLLM
|
||||
api_meta_template = dict(round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
])
|
||||
|
||||
|
||||
judge_models = [
|
||||
dict(
|
||||
type=OpenAISDK,
|
||||
abbr='gpt-4o-2024-08-06',
|
||||
path='gpt-4o-2024-08-06',
|
||||
# openai_api_base=
|
||||
# 'http://10.140.1.86:10001/v1', # Change to your own url if needed.
|
||||
key='YOUR_API_KEY',
|
||||
retry=10,
|
||||
meta_template=api_meta_template,
|
||||
rpm_verbose=True,
|
||||
query_per_second=1,
|
||||
max_out_len=4096,
|
||||
max_seq_len=16384,
|
||||
batch_size=16,
|
||||
temperature=0.01,
|
||||
tokenizer_path='gpt-4o-2024-08-06'
|
||||
)
|
||||
]
|
||||
|
||||
# Evaluation with local runner
|
||||
eval = dict(
|
||||
partitioner=dict(
|
||||
type=SubjectiveNaivePartitioner,
|
||||
models=models,
|
||||
judge_models=judge_models,
|
||||
),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
task=dict(type=SubjectiveEvalTask)),
|
||||
)
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 5 Utils Configuration #
|
||||
#######################################################################
|
||||
base_exp_dir = 'outputs/corebench/'
|
||||
work_dir = osp.join(base_exp_dir, 'chat_subjective')
|
configs/eval_dingo.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from mmengine.config import read_base

with read_base():
    from .models.hf_internlm.hf_internlm_7b import models
    from .datasets.dingo.dingo_gen import datasets

work_dir = './outputs/eval_dingo'
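# Launch example (assuming the standard OpenCompass entry point at the repo root):
#   python run.py configs/eval_dingo.py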
@ -1,69 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import LmdeployPytorchModel
|
||||
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
|
||||
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
|
||||
    # and output the results in a chosen format
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
|
||||
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
||||
],
|
||||
eos_token_id=103028)
|
||||
|
||||
# config for internlm-chat-7b
|
||||
internlm_chat_7b = dict(
|
||||
type=LmdeployPytorchModel,
|
||||
abbr='internlm-chat-7b-pytorch',
|
||||
path='internlm/internlm-chat-7b',
|
||||
engine_config=dict(session_len=2048,
|
||||
max_batch_size=16),
|
||||
gen_config=dict(top_k=1,
|
||||
top_p=0.8,
|
||||
temperature=1.0,
|
||||
max_new_tokens=100),
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=16,
|
||||
concurrency=16,
|
||||
meta_template=meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>',
|
||||
)
|
||||
|
||||
# config for internlm-chat-20b
|
||||
internlm_chat_20b = dict(
|
||||
type=LmdeployPytorchModel,
|
||||
abbr='internlm-chat-20b-pytorch',
|
||||
path='internlm/internlm-chat-20b',
|
||||
engine_config=dict(session_len=2048,
|
||||
max_batch_size=8),
|
||||
gen_config=dict(top_k=1,
|
||||
top_p=0.8,
|
||||
temperature=1.0,
|
||||
max_new_tokens=100),
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
concurrency=8,
|
||||
meta_template=meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>',
|
||||
)
|
||||
|
||||
models = [internlm_chat_20b]
|
@ -1,41 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.lmdeploy_tis import LmdeployTisModel
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
|
||||
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
|
||||
    # and output the results in a chosen format
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||
],
|
||||
eos_token_id=92542
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=LmdeployTisModel,
|
||||
abbr='internlm-chat-20b-lmdeploy-tis',
|
||||
path='internlm/internlm-chat-20b',
|
||||
tis_addr='0.0.0.0:33337',
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
meta_template=meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<|im_end|>',
|
||||
)
|
||||
]
|
@ -1,40 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.turbomind_tis import TurboMindTisModel
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
|
||||
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
|
||||
    # and output the results in a chosen format
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='<|User|>:', end='\n'),
|
||||
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
||||
],
|
||||
eos_token_id=103028)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindTisModel,
|
||||
abbr='internlm-chat-20b-turbomind',
|
||||
path='internlm',
|
||||
tis_addr='0.0.0.0:33337',
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
meta_template=meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
)
|
||||
]
|
@ -1,28 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.turbomind_tis import TurboMindTisModel
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
    # and output the results in a chosen format
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindTisModel,
|
||||
abbr='internlm-chat-20b-turbomind',
|
||||
path='internlm',
|
||||
tis_addr='0.0.0.0:33337',
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
)
|
||||
]
|
configs/models/bailing_api/bailing-lite-0830.py (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
from opencompass.models import BailingAPI
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=False),
|
||||
],
|
||||
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Lite-0830',
|
||||
token='', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
generation_kwargs={
|
||||
'temperature': 0.4,
|
||||
'top_p': 1.0,
|
||||
'top_k': -1,
|
||||
'n': 1,
|
||||
'logprobs': 1,
|
||||
'use_beam_search': False,
|
||||
},
|
||||
),
|
||||
]
|
configs/models/bailing_api/bailing-pro-0920.py (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
from opencompass.models import BailingAPI
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=False),
|
||||
],
|
||||
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Pro-0920',
|
||||
token='', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
generation_kwargs={
|
||||
'temperature': 0.4,
|
||||
'top_p': 1.0,
|
||||
'top_k': -1,
|
||||
'n': 1,
|
||||
'logprobs': 1,
|
||||
'use_beam_search': False,
|
||||
},
|
||||
),
|
||||
]
|
@ -1,15 +1,24 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='internlm2-chat-7b-turbomind',
|
||||
abbr=f'internlm2-chat-7b-lmdeploy',
|
||||
path='internlm/internlm2-chat-7b',
|
||||
engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
|
||||
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
|
||||
# If the model is not supported by 'turbomind', it will fallback to
|
||||
# 'pytorch'
|
||||
backend='turbomind',
|
||||
# For the detailed engine config and generation config, please refer to
|
||||
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
|
||||
engine_config=dict(tp=1),
|
||||
gen_config=dict(do_sample=False),
|
||||
max_seq_len=8192,
|
||||
max_out_len=4096,
|
||||
batch_size=16,
|
||||
# the max number of prompts that LMDeploy receives
|
||||
# in `generate` function
|
||||
batch_size=5000,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
|
configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
from opencompass.models import TurboMindModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModel,
|
||||
abbr='qwen2.5-1.5b-turbomind',
|
||||
path='Qwen/Qwen2.5-1.5B',
|
||||
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
|
||||
max_seq_len=7168,
|
||||
max_out_len=1024,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
from opencompass.models import TurboMindModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModel,
|
||||
abbr='qwen2.5-7b-turbomind',
|
||||
path='Qwen/Qwen2.5-7B',
|
||||
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
|
||||
max_seq_len=7168,
|
||||
max_out_len=1024,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
docs/en/advanced_guides/evaluation_lmdeploy.md (new file, 88 lines)
@@ -0,0 +1,88 @@
# Evaluation with LMDeploy

We support evaluating models accelerated by [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit for compressing, deploying, and serving LLMs, and it delivers remarkable inference performance. This guide illustrates how to evaluate a model with LMDeploy support in OpenCompass.

## Setup

### Install OpenCompass

Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install OpenCompass and prepare the evaluation datasets.

### Install LMDeploy

Install LMDeploy via pip (Python 3.8+):

```shell
pip install lmdeploy
```

The default prebuilt package is compiled against CUDA 12. If you need a CUDA 11+ build, install lmdeploy with:

```shell
export LMDEPLOY_VERSION=0.6.0
export PYTHON_VERSION=310
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```
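If the installation succeeded, a quick sanity check (not part of the original guide) is to print the installed version:

```shell
python -c "import lmdeploy; print(lmdeploy.__version__)"
```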

## Evaluation

When evaluating a model, you need to prepare an evaluation configuration that specifies the evaluation datasets, the model, and the inference parameters.

Taking [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as an example, the evaluation config is as follows:

```python
# configure the dataset
from mmengine.config import read_base


with read_base():
    # choose a list of datasets
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
        gsm8k_datasets
    # and output the results in a chosen format
    from .summarizers.medium import summarizer

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# configure lmdeploy
from opencompass.models import TurboMindModelwithChatTemplate


# configure the model
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='internlm2-chat-7b-lmdeploy',
        # model path, which can be the address of a model repository on the
        # Hugging Face Hub or a local path
        path='internlm/internlm2-chat-7b',
        # inference backend of LMDeploy. It can be either 'turbomind' or
        # 'pytorch'. If the model is not supported by 'turbomind', it will
        # fall back to 'pytorch'
        backend='turbomind',
        # For the detailed engine config and generation config, please refer to
        # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
        engine_config=dict(tp=1),
        gen_config=dict(do_sample=False),
        # the max size of the context window
        max_seq_len=7168,
        # the max number of new tokens
        max_out_len=1024,
        # the max number of prompts that LMDeploy receives
        # in `generate` function
        batch_size=5000,
        run_cfg=dict(num_gpus=1),
    )
]
```
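If TurboMind does not support a given model, the same config can point at LMDeploy's PyTorch engine instead. Below is a minimal sketch of that variant; only the fields that differ are changed, the `abbr` label is arbitrary, and the exact engine options depend on your LMDeploy version:

```python
# a hypothetical variant of the model entry above, using the PyTorch backend
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='internlm2-chat-7b-lmdeploy-pytorch',
        path='internlm/internlm2-chat-7b',
        backend='pytorch',         # use LMDeploy's PyTorch engine
        engine_config=dict(tp=1),  # engine options are backend-specific
        gen_config=dict(do_sample=False),
        max_seq_len=7168,
        max_out_len=1024,
        batch_size=5000,
        run_cfg=dict(num_gpus=1),
    )
]
```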

Place the above configuration in a file, e.g. "configs/eval_internlm2_lmdeploy.py". Then, from the OpenCompass root directory, start the evaluation with:

```shell
python run.py configs/eval_internlm2_lmdeploy.py -w outputs
```

You will get the evaluation results once inference and evaluation have finished.
@ -1,78 +0,0 @@
|
||||
# Evaluation with LMDeploy
|
||||
|
||||
We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. **TurboMind** is an efficient inference engine proposed by LMDeploy. OpenCompass is compatible with TurboMind. We now illustrate how to evaluate a model with the support of TurboMind in OpenCompass.
|
||||
|
||||
## Setup
|
||||
|
||||
### Install OpenCompass
|
||||
|
||||
Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets.
|
||||
|
||||
### Install LMDeploy
|
||||
|
||||
Install lmdeploy via pip (python 3.8+)
|
||||
|
||||
```shell
|
||||
pip install lmdeploy
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
|
||||
OpenCompass integrates turbomind's python API for evaluation.
|
||||
|
||||
We take the InternLM-20B as example. Firstly, we prepare the evaluation config `configs/eval_internlm_turbomind.py`:
|
||||
|
||||
```python
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.turbomind import TurboMindModel
|
||||
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
# and output the results in a chosen format
|
||||
from .summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
# config for internlm-20b model
|
||||
internlm_20b = dict(
|
||||
type=TurboMindModel,
|
||||
abbr='internlm-20b-turbomind',
|
||||
path="internlm/internlm-20b", # this path should be same as in huggingface
|
||||
engine_config=dict(session_len=2048,
|
||||
max_batch_size=8,
|
||||
rope_scaling_factor=1.0),
|
||||
gen_config=dict(top_k=1, top_p=0.8,
|
||||
temperature=1.0,
|
||||
max_new_tokens=100),
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
concurrency=8,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>'
|
||||
)
|
||||
|
||||
models = [internlm_20b]
|
||||
```
|
||||
|
||||
Then, in the home folder of OpenCompass, start evaluation by the following command:
|
||||
|
||||
```shell
|
||||
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
|
||||
```
|
||||
|
||||
You are expected to get the evaluation results after the inference and evaluation.
|
||||
|
||||
**Note**:
|
||||
|
||||
- If you want to pass more arguments for `engine_config` and `gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig)
|
||||
and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig)
|
||||
- If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py`
|
||||
- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line.
|
docs/zh_cn/advanced_guides/evaluation_lmdeploy.md (new file, 86 lines)
@@ -0,0 +1,86 @@
|
||||
# 使用 LMDeploy 加速评测
|
||||
|
||||
我们支持在评测大语言模型时,使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 作为推理加速引擎。LMDeploy 是涵盖了 LLM 和 VLM 任务的全套轻量化、部署和服务解决方案,拥有卓越的推理性能。本教程将介绍如何使用 LMDeploy 加速对模型的评测。
|
||||
|
||||
## 环境配置
|
||||
|
||||
### 安装 OpenCompass
|
||||
|
||||
请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。
|
||||
|
||||
### 安装 LMDeploy
|
||||
|
||||
使用 pip 安装 LMDeploy (python 3.8+):
|
||||
|
||||
```shell
|
||||
pip install lmdeploy
|
||||
```
|
||||
|
||||
LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令:
|
||||
|
||||
```shell
|
||||
export LMDEPLOY_VERSION=0.6.0
|
||||
export PYTHON_VERSION=310
|
||||
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
```
|
||||
|
||||
## 评测
|
||||
|
||||
在评测一个模型时,需要准备一份评测配置,指明评测集、模型和推理参数等信息。
|
||||
|
||||
以 [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) 模型为例,相关的配置信息如下:
|
||||
|
||||
```python
|
||||
# configure the dataset
|
||||
from mmengine.config import read_base
|
||||
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
|
||||
gsm8k_datasets
|
||||
# and output the results in a chosen format
|
||||
from .summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
# configure lmdeploy
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
|
||||
|
||||
# configure the model
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr=f'internlm2-chat-7b-lmdeploy',
|
||||
# model path, which can be the address of a model repository on the Hugging Face Hub or a local path
|
||||
path='internlm/internlm2-chat-7b',
|
||||
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
|
||||
# If the model is not supported by 'turbomind', it will fallback to
|
||||
# 'pytorch'
|
||||
backend='turbomind',
|
||||
# For the detailed engine config and generation config, please refer to
|
||||
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
|
||||
engine_config=dict(tp=1),
|
||||
gen_config=dict(do_sample=False),
|
||||
# the max size of the context window
|
||||
max_seq_len=7168,
|
||||
# the max number of new tokens
|
||||
max_out_len=1024,
|
||||
# the max number of prompts that LMDeploy receives
|
||||
# in `generate` function
|
||||
batch_size=32,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
```
|
||||
|
||||
把上述配置放在文件中,比如 "configs/eval_internlm2_lmdeploy.py"。然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果:
|
||||
|
||||
```shell
|
||||
python run.py configs/eval_internlm2_lmdeploy.py -w outputs
|
||||
```
|
@ -1,75 +0,0 @@
|
||||
# 评测 LMDeploy 模型
|
||||
|
||||
我们支持评测使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 加速过的大语言模型。LMDeploy 由 MMDeploy 和 MMRazor 团队联合开发,是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。 **TurboMind** 是 LMDeploy 推出的高效推理引擎。OpenCompass 对 TurboMind 进行了适配,本教程将介绍如何使用 OpenCompass 来对 TurboMind 加速后的模型进行评测。
|
||||
|
||||
## 环境配置
|
||||
|
||||
### 安装 OpenCompass
|
||||
|
||||
请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。
|
||||
|
||||
### 安装 LMDeploy
|
||||
|
||||
使用 pip 安装 LMDeploy (python 3.8+):
|
||||
|
||||
```shell
|
||||
pip install lmdeploy
|
||||
```
|
||||
|
||||
## 评测
|
||||
|
||||
OpenCompass 支持分别通过 turbomind python API 评测数据集。
|
||||
|
||||
下文以 InternLM-20B 模型为例,介绍如何评测。首先我们准备好测试配置文件`configs/eval_internlm_turbomind.py`:
|
||||
|
||||
```python
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.turbomind import TurboMindModel
|
||||
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
# and output the results in a chosen format
|
||||
from .summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
# config for internlm-20b model
|
||||
internlm_20b = dict(
|
||||
type=TurboMindModel,
|
||||
abbr='internlm-20b-turbomind',
|
||||
path="internlm/internlm-20b", # 注意路径与huggingface保持一致
|
||||
engine_config=dict(session_len=2048,
|
||||
max_batch_size=8,
|
||||
rope_scaling_factor=1.0),
|
||||
gen_config=dict(top_k=1, top_p=0.8,
|
||||
temperature=1.0,
|
||||
max_new_tokens=100),
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
concurrency=8,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>'
|
||||
)
|
||||
|
||||
models = [internlm_20b]
|
||||
```
|
||||
|
||||
然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果:
|
||||
|
||||
```shell
|
||||
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
|
||||
```
|
||||
|
||||
**注:**
|
||||
|
||||
- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig)
|
||||
- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py`
|
||||
- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。
|
@ -1 +1 @@
|
||||
__version__ = '0.3.2.post1'
|
||||
__version__ = '0.3.3'
|
||||
|
opencompass/configs/datasets/dingo/dingo_gen.py (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import DingoDataset, DingoEvaluator
|
||||
|
||||
|
||||
dingo_paths = [
|
||||
'./data/dingo/en_192.csv',
|
||||
'./data/dingo/zh_170.csv',
|
||||
]
|
||||
|
||||
dingo_datasets = []
|
||||
for path in dingo_paths:
|
||||
dingo_reader_cfg = dict(input_columns='input', output_column=None)
|
||||
dingo_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[dict(role='HUMAN', prompt='{input}')])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT')
|
||||
|
||||
dingo_datasets.append(
|
||||
dict(
|
||||
abbr='dingo_' + path.split('/')[-1].split('.csv')[0],
|
||||
type=DingoDataset,
|
||||
path=path,
|
||||
reader_cfg=dingo_reader_cfg,
|
||||
infer_cfg=dingo_infer_cfg,
|
||||
eval_cfg=dingo_eval_cfg,
|
||||
))
|
||||
|
||||
datasets = dingo_datasets
|
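# Each CSV above becomes one dataset entry whose abbr is 'dingo_' plus the file
# stem, e.g. 'dingo_en_192' and 'dingo_zh_170'.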
@ -15,7 +15,7 @@ subjective_all_sets = [
|
||||
]
|
||||
data_path ='data/subjective/followbench/converted_data'
|
||||
|
||||
followbench_llmeval_dataset = []
|
||||
followbench_llmeval_datasets = []
|
||||
|
||||
for _name in subjective_all_sets:
|
||||
subjective_infer_cfg = dict(
|
||||
@ -48,7 +48,7 @@ for _name in subjective_all_sets:
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
followbench_llmeval_dataset.append(
|
||||
followbench_llmeval_datasets.append(
|
||||
dict(
|
||||
abbr=f'{_name}',
|
||||
type=FollowBenchDataset,
|
||||
|
@ -0,0 +1,73 @@
|
||||
import copy
|
||||
|
||||
from opencompass.datasets import WikiBenchDataset
|
||||
from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator
|
||||
from opencompass.openicl.icl_inferencer import PPLInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
|
||||
single_choice_prompts = {
|
||||
'single_choice_cn': [
|
||||
dict(role='HUMAN',
|
||||
prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501
|
||||
),
|
||||
dict(role='BOT', prompt='回答: B'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次'
|
||||
),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501
|
||||
),
|
||||
dict(role='BOT', prompt='回答: B'),
|
||||
dict(role='HUMAN',
|
||||
prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 丹阳县'),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(role='HUMAN', prompt='问题: {question}'),
|
||||
dict(role='BOT', prompt='回答: {answer}'),
|
||||
]
|
||||
}
|
||||
|
||||
wikibench_sets = {
|
||||
'wiki': ['single_choice_cn'],
|
||||
}
|
||||
|
||||
do_circular = True
|
||||
|
||||
wikibench_datasets = []
|
||||
|
||||
for _split in list(wikibench_sets.keys()):
|
||||
for _name in wikibench_sets[_split]:
|
||||
template = {}
|
||||
for answer in ['A', 'B', 'C', 'D']:
|
||||
one_template_round = copy.deepcopy(single_choice_prompts[_name])
|
||||
one_template_round[-1]['prompt'] = one_template_round[-1][
|
||||
'prompt'].format(answer=answer)
|
||||
template[answer] = dict(round=one_template_round)
|
||||
wikibench_infer_cfg = dict(
|
||||
prompt_template=dict(type=PromptTemplate, template=template),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=PPLInferencer),
|
||||
)
|
||||
wikibench_eval_cfg = dict(evaluator=dict(
|
||||
type=CircularEvaluator if do_circular else AccEvaluator), )
|
||||
wikibench_datasets.append(
|
||||
dict(
|
||||
type=WikiBenchDataset,
|
||||
path=f'./data/WikiBench/{_name}.jsonl',
|
||||
name='circular_' + _name if do_circular else _name,
|
||||
abbr='wikibench-' + _split + '-' + _name +
|
||||
'circular' if do_circular else '',
|
||||
reader_cfg=dict(
|
||||
input_columns=['question'],
|
||||
output_column='answer',
|
||||
),
|
||||
infer_cfg=wikibench_infer_cfg,
|
||||
eval_cfg=wikibench_eval_cfg,
|
||||
))
|
opencompass/configs/models/bailing_api/bailing-lite-0830.py (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
from opencompass.models import BailingAPI
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=False),
|
||||
],
|
||||
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Lite-0830',
|
||||
token='', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
generation_kwargs={
|
||||
'temperature': 0.4,
|
||||
'top_p': 1.0,
|
||||
'top_k': -1,
|
||||
'n': 1,
|
||||
'logprobs': 1,
|
||||
'use_beam_search': False,
|
||||
},
|
||||
),
|
||||
]
|
opencompass/configs/models/bailing_api/bailing-pro-0920.py (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
from opencompass.models import BailingAPI
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=False),
|
||||
],
|
||||
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Pro-0920',
|
||||
token='', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
generation_kwargs={
|
||||
'temperature': 0.4,
|
||||
'top_p': 1.0,
|
||||
'top_k': -1,
|
||||
'n': 1,
|
||||
'logprobs': 1,
|
||||
'use_beam_search': False,
|
||||
},
|
||||
),
|
||||
]
|
@ -1,15 +1,24 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='internlm2-chat-7b-turbomind',
|
||||
abbr=f'internlm2-chat-7b-lmdeploy',
|
||||
path='internlm/internlm2-chat-7b',
|
||||
engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
|
||||
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
|
||||
# If the model is not supported by 'turbomind', it will fallback to
|
||||
# 'pytorch'
|
||||
backend='turbomind',
|
||||
# For the detailed engine config and generation config, please refer to
|
||||
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
|
||||
engine_config=dict(tp=1),
|
||||
gen_config=dict(do_sample=False),
|
||||
max_seq_len=8192,
|
||||
max_out_len=4096,
|
||||
batch_size=16,
|
||||
# the max number of prompts that LMDeploy receives
|
||||
# in `generate` function
|
||||
batch_size=5000,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
|
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
from opencompass.models import TurboMindModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModel,
|
||||
abbr='qwen2.5-1.5b-turbomind',
|
||||
path='Qwen/Qwen2.5-1.5B',
|
||||
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
|
||||
max_seq_len=7168,
|
||||
max_out_len=1024,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
from opencompass.models import TurboMindModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModel,
|
||||
abbr='qwen2.5-7b-turbomind',
|
||||
path='Qwen/Qwen2.5-7B',
|
||||
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
|
||||
max_seq_len=7168,
|
||||
max_out_len=1024,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
@ -16,7 +16,7 @@ class GaokaoBenchDataset(BaseDataset):
|
||||
|
||||
@staticmethod
|
||||
def load(path: str, name: str):
|
||||
data = get_data_path(path, local_mode=True)
|
||||
path = get_data_path(path, local_mode=True)
|
||||
if environ.get('DATASET_SOURCE') == 'ModelScope':
|
||||
from modelscope import MsDataset
|
||||
return MsDataset.load(path, subset_name=name, split='test')
|
||||
|
@ -33,6 +33,7 @@ from .crowspairs_cn import * # noqa: F401, F403
|
||||
from .csl import * # noqa: F401, F403
|
||||
from .custom import * # noqa: F401, F403
|
||||
from .cvalues import * # noqa: F401, F403
|
||||
from .dingo import * # noqa: F401, F403
|
||||
from .drcd import * # noqa: F401, F403
|
||||
from .drop import * # noqa: F401, F403
|
||||
from .drop_simple_eval import * # noqa: F401, F403
|
||||
|
opencompass/datasets/dingo.py (new file, 84 lines)
@@ -0,0 +1,84 @@
|
||||
# flake8: noqa
|
||||
# yapf: disable
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
from datasets import Dataset
|
||||
|
||||
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
||||
|
||||
from .base import BaseDataset
|
||||
|
||||
|
||||
@LOAD_DATASET.register_module()
|
||||
class DingoDataset(BaseDataset):
|
||||
|
||||
@staticmethod
|
||||
def load(path: str):
|
||||
raw_data = []
|
||||
with open(path, encoding='utf-8') as f:
|
||||
reader = csv.reader(f, delimiter=';')
|
||||
for row in reader:
|
||||
if len(row) < 1:
|
||||
row = ['']
|
||||
raw_data.append({'input': row[0]})
|
||||
return Dataset.from_list(raw_data)
|
||||
|
||||
|
||||
@LOAD_DATASET.register_module()
|
||||
class DingoLongDataset(BaseDataset):
|
||||
|
||||
@staticmethod
|
||||
def load(path: str):
|
||||
raw_data = []
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
raw_data.append({'input': json.loads(line).get('input')})
|
||||
return Dataset.from_list(raw_data)
|
||||
|
||||
|
||||
@ICL_EVALUATORS.register_module()
|
||||
class DingoEvaluator(BaseEvaluator):
|
||||
|
||||
def score(self, origin_prompt: List, predictions: List) -> dict:
|
||||
try:
|
||||
# from dingo.model.model import Model
|
||||
from dingo.exec import Executor
|
||||
from dingo.io import InputArgs
|
||||
except Exception:
|
||||
raise ModuleNotFoundError(
|
||||
'=========== '
|
||||
'dingo register fail. please try: pip install dingo-python.'
|
||||
' ===========')
|
||||
|
||||
current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
|
||||
file_data = [{'prompt': pmt, 'prediction': prd}
|
||||
for pmt, prd in zip(origin_prompt, predictions)]
|
||||
file_name = 'dingo_file_' + current_time + '.jsonl'
|
||||
with open(file_name, 'a', encoding='utf-8') as f:
|
||||
for d in file_data:
|
||||
json.dump(d, f, ensure_ascii=False)
|
||||
f.write('\n')
|
||||
|
||||
input_data = {
|
||||
'eval_models': ['llm_base'],
|
||||
'input_path': file_name,
|
||||
'output_path': './outputs/dingo/',
|
||||
'dataset': 'local',
|
||||
'datasource': 'local',
|
||||
'data_format': 'jsonl',
|
||||
'column_prompt': ['prompt'],
|
||||
'column_content': ['prediction'],
|
||||
}
|
||||
# Model.apply_config(input_data["custom_config_path"])
|
||||
input_args = InputArgs(**input_data)
|
||||
executor = Executor.exec_map['local'](input_args)
|
||||
result = executor.execute()
|
||||
summary = result[0].to_dict()
|
||||
|
||||
os.remove(file_name)
|
||||
return summary
|
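# Summary of the evaluator flow above: prompt/prediction pairs are dumped to a
# temporary JSONL file, dingo's local Executor scores that file, the first
# result's summary dict is returned as the OpenCompass score, and the temporary
# file is removed afterwards.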
@ -3,6 +3,7 @@ from .ai360_api import AI360GPT # noqa: F401
|
||||
from .alaya import AlayaLM # noqa: F401
|
||||
from .baichuan_api import BaiChuan # noqa: F401
|
||||
from .baidu_api import ERNIEBot # noqa: F401
|
||||
from .bailing_api_oc import BailingAPI # noqa: F401
|
||||
from .base import BaseModel, LMTemplateParser # noqa: F401
|
||||
from .base_api import APITemplateParser, BaseAPIModel # noqa: F401
|
||||
from .bytedance_api import ByteDance # noqa: F401
|
||||
@ -24,8 +25,6 @@ from .interntrain import InternTrain # noqa: F401
|
||||
from .krgpt_api import KrGPT # noqa: F401
|
||||
from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401
|
||||
from .llama2 import Llama2, Llama2Chat # noqa: F401
|
||||
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
|
||||
from .lmdeploy_tis import LmdeployTisModel # noqa: F401
|
||||
from .minimax_api import MiniMax, MiniMaxChatCompletionV2 # noqa: F401
|
||||
from .mistral_api import Mistral # noqa: F401
|
||||
from .mixtral import Mixtral # noqa: F401
|
||||
@ -40,7 +39,6 @@ from .rendu_api import Rendu # noqa: F401
|
||||
from .sensetime_api import SenseTime # noqa: F401
|
||||
from .stepfun_api import StepFun # noqa: F401
|
||||
from .turbomind import TurboMindModel # noqa: F401
|
||||
from .turbomind_tis import TurboMindTisModel # noqa: F401
|
||||
from .turbomind_with_tf_above_v4_33 import \
|
||||
TurboMindModelwithChatTemplate # noqa: F401
|
||||
from .unigpt_api import UniGPT # noqa: F401
|
||||
|
opencompass/models/bailing_api_oc.py (new file, 225 lines)
@@ -0,0 +1,225 @@
|
||||
import concurrent
|
||||
import concurrent.futures
|
||||
import os
|
||||
import socket
|
||||
import traceback
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.connection import HTTPConnection
|
||||
|
||||
try:
|
||||
from retrying import retry
|
||||
except ImportError:
|
||||
retry = None
|
||||
|
||||
from opencompass.utils.prompt import PromptList
|
||||
|
||||
from .base_api import BaseAPIModel
|
||||
|
||||
PromptType = Union[PromptList, str]
|
||||
|
||||
|
||||
class HTTPAdapterWithSocketOptions(HTTPAdapter):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._socket_options = HTTPConnection.default_socket_options + [
|
||||
(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1),
|
||||
(socket.SOL_TCP, socket.TCP_KEEPIDLE, 75),
|
||||
(socket.SOL_TCP, socket.TCP_KEEPINTVL, 30),
|
||||
(socket.SOL_TCP, socket.TCP_KEEPCNT, 120),
|
||||
]
|
||||
super(HTTPAdapterWithSocketOptions, self).__init__(*args, **kwargs)
|
||||
|
||||
def init_poolmanager(self, *args, **kwargs):
|
||||
if self._socket_options is not None:
|
||||
kwargs['socket_options'] = self._socket_options
|
||||
super(HTTPAdapterWithSocketOptions,
|
||||
self).init_poolmanager(*args, **kwargs)
|
||||
|
||||
|
||||
class BailingAPI(BaseAPIModel):
|
||||
"""Model wrapper around Bailing Service.
|
||||
|
||||
Args:
|
||||
        output_key (str): key for prediction
|
||||
query_per_second (int): The maximum queries allowed per second
|
||||
between two consecutive calls of the API. Defaults to 1.
|
||||
generation_kwargs: other params
|
||||
        retry (int): Number of retries if the API call fails. Defaults to 2.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
token: str,
|
||||
url: str,
|
||||
meta_template: Optional[Dict] = None,
|
||||
query_per_second: int = 1,
|
||||
retry: int = 3,
|
||||
generation_kwargs: Dict = {},
|
||||
max_seq_len=4096,
|
||||
):
|
||||
super().__init__(
|
||||
path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
query_per_second=query_per_second,
|
||||
meta_template=meta_template,
|
||||
retry=retry,
|
||||
generation_kwargs=generation_kwargs,
|
||||
)
|
||||
|
||||
self.logger.info(f'Bailing API Model Init path: {path} url={url}')
|
||||
if not token:
|
||||
token = os.environ.get('BAILING_API_KEY')
|
||||
if token:
|
||||
self._headers = {'Authorization': f'Bearer {token}'}
|
||||
else:
|
||||
                raise RuntimeError('There is no valid token.')
|
||||
else:
|
||||
self._headers = {'Authorization': f'Bearer {token}'}
|
||||
|
||||
self._headers['Content-Type'] = 'application/json'
|
||||
self._url = url if url else \
|
||||
'https://bailingchat.alipay.com/chat/completions'
|
||||
self._model = path
|
||||
self._sessions = []
|
||||
self._num = (int(os.environ.get('BAILING_API_PARALLEL_NUM'))
|
||||
if os.environ.get('BAILING_API_PARALLEL_NUM') else 1)
|
||||
try:
|
||||
for _ in range(self._num):
|
||||
adapter = HTTPAdapterWithSocketOptions()
|
||||
sess = requests.Session()
|
||||
sess.mount('http://', adapter)
|
||||
sess.mount('https://', adapter)
|
||||
self._sessions.append(sess)
|
||||
except Exception as e:
|
||||
self.logger.error(f'Fail to setup the session. {e}')
|
||||
raise e
|
||||
|
||||
def generate(
|
||||
self,
|
||||
inputs: Union[List[str], PromptList],
|
||||
max_out_len: int = 4096,
|
||||
) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
inputs (Union[List[str], PromptList]):
|
||||
A list of strings or PromptDicts.
|
||||
The PromptDict should be organized in OpenCompass' API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
with concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=self._num, ) as executor:
|
||||
future_to_m = {
|
||||
executor.submit(
|
||||
self._generate,
|
||||
self._sessions[i % self._num],
|
||||
input,
|
||||
max_out_len,
|
||||
): i
|
||||
for i, input in enumerate(inputs)
|
||||
}
|
||||
results = []
|
||||
for future in concurrent.futures.as_completed(future_to_m):
|
||||
m = future_to_m[future] # noqa F841
|
||||
resp = future.result()
|
||||
if resp and resp.status_code == 200:
|
||||
try:
|
||||
result = resp.json()
|
||||
except Exception as e: # noqa F841
|
||||
results.append('')
|
||||
else:
|
||||
if (result.get('choices')
|
||||
and result['choices'][0].get('message')
|
||||
and result['choices'][0]['message'].get(
|
||||
'content')):
|
||||
results.append(
|
||||
result['choices'][0]['message']['content'])
|
||||
else:
|
||||
results.append('')
|
||||
self.flush()
|
||||
return results
|
||||
|
||||
def _generate(
|
||||
self,
|
||||
sess,
|
||||
input: Union[str, PromptList],
|
||||
max_out_len: int,
|
||||
) -> str:
|
||||
"""Generate results given an input.
|
||||
|
||||
Args:
|
||||
inputs (str or PromptList): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass' API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
if isinstance(input, str):
|
||||
messages = [{'role': 'user', 'content': input}]
|
||||
else:
|
||||
messages = []
|
||||
for item in input:
|
||||
content = item['prompt']
|
||||
if not content:
|
||||
continue
|
||||
message = {'content': content}
|
||||
if item['role'] == 'HUMAN':
|
||||
message['role'] = 'user'
|
||||
elif item['role'] == 'BOT':
|
||||
message['role'] = 'assistant'
|
||||
elif item['role'] == 'SYSTEM':
|
||||
message['role'] = 'system'
|
||||
else:
|
||||
message['role'] = item['role']
|
||||
messages.append(message)
|
||||
request = {
|
||||
'model':
|
||||
self._model,
|
||||
'messages':
|
||||
messages,
|
||||
'max_seq_len':
|
||||
max(
|
||||
max_out_len if max_out_len else 4096,
|
||||
self.max_seq_len if self.max_seq_len else 4096,
|
||||
),
|
||||
}
|
||||
request.update(self.generation_kwargs)
|
||||
try:
|
||||
retry_num = 0
|
||||
while retry_num < self.retry:
|
||||
response = self._infer_result(request, sess)
|
||||
if response.status_code == 200:
|
||||
break # success
|
||||
elif response.status_code == 426:
|
||||
retry_num += 1 # retry
|
||||
else:
|
||||
raise ValueError(f'Status code = {response.status_code}')
|
||||
else:
|
||||
raise ValueError(
|
||||
f'Exceed the maximal retry times. Last status code '
|
||||
f'= {response.status_code}')
|
||||
except Exception as e:
|
||||
            self.logger.error(f'Failed to infer request={request}; '
|
||||
f'model_name={self.path}; error={e}, '
|
||||
f'stack:{traceback.format_exc()}')
|
||||
raise e
|
||||
return response
|
||||
|
||||
# @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms
|
||||
def _infer_result(self, request, sess):
|
||||
response = sess.request(
|
||||
'POST',
|
||||
self._url,
|
||||
json=request,
|
||||
headers=self._headers,
|
||||
timeout=500,
|
||||
)
|
||||
return response
|
@ -79,6 +79,50 @@ class LegacyInternTrainManager(InternTrainManager):
|
||||
|
||||
@MODELS.register_module()
|
||||
class InternTrain(BaseModel):
|
||||
"""Model wrapper for InternTrain.
|
||||
|
||||
Args:
|
||||
path (str): The name or path to HuggingFace's model.
|
||||
module_path (str): Path of InternTrain repository.
|
||||
max_seq_len (int): The maximum length of the input sequence. Defaults
|
||||
to 2048.
|
||||
tokenizer_only (bool): If True, only the tokenizer will be initialized.
|
||||
Defaults to False.
|
||||
tokenizer_path (str): The path to the tokenizer. Defaults to None.
|
||||
tokenizer_type: InternTrain's tokenizer type. Defaults to 'InternLM'.
|
||||
model_config (str, dict, optional): Config of model. There are several
|
||||
options for this parameter:
|
||||
|
||||
- filename (str): The config items are defined in a python file
|
||||
so the model will load configs from this file.
|
||||
- config (dict): The configuration items are defined in a dict
|
||||
and the model will be initialized from ```model_config```.
|
||||
- None: The config is loaded from ```path```. In this case,
|
||||
please make sure that ```path``` contains a config file named
|
||||
``model_config.pt``.
|
||||
|
||||
Defaults to None.
|
||||
model_type: Type of model. Defaults to 'InternTrain'
|
||||
ckpt_type: The type of load function in InternTrain when checkpoints
|
||||
are loaded. Defaults to None, which means load the checkpoint
|
||||
            directly with pipeline merged.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
model_dtype: The model's dtype. If None, will use dtype defined in
|
||||
```model_config```. Defaults to None.
|
||||
generation_kwargs (Dict, optional): The generation kwargs for the
|
||||
model. Defaults to dict().
|
||||
sync_rank (bool): Whether to sync inputs between ranks. Do not use this
|
||||
if you are not familiar with this behavior. Check `sync_inputs`
|
||||
function for more details. Defaults to False.
|
||||
mode (str, optional): The method of input truncation when input length
|
||||
exceeds max_seq_len. 'mid' represents the part of input to
|
||||
truncate. Defaults to 'none'.
|
||||
end_str (str, optional): Whether to trim generated strings with end_str
|
||||
if the model has special ending strings that are not handled well.
|
||||
Defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
path: str,
|
||||
@ -87,14 +131,15 @@ class InternTrain(BaseModel):
|
||||
tokenizer_only: bool = False,
|
||||
tokenizer_path: Optional[str] = None,
|
||||
tokenizer_type: str = 'INTERNLM',
|
||||
model_config: Optional[str] = None,
|
||||
model_config: Optional[Union[str, Dict]] = None,
|
||||
model_type: str = 'INTERNLM2',
|
||||
ckpt_type: Optional[str] = None,
|
||||
meta_template: Optional[Dict] = None,
|
||||
model_dtype: Optional[str] = None,
|
||||
generation_kwargs={},
|
||||
sync_rank: bool = False,
|
||||
mode='none'):
|
||||
mode='none',
|
||||
end_str: Optional[str] = None):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
tokenizer_only=tokenizer_only,
|
||||
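With this change, model_config can be passed either as a path to a config file or as an in-memory dict. A hedged instantiation sketch (the paths and config values below are invented placeholders, not a configuration from the repository; only the parameter names come from the signature and docstring above):

# Hedged sketch: values are placeholders, not an official config.
model = InternTrain(
    path='path/to/ckpt_dir',
    tokenizer_path='path/to/tokenizer.model',
    tokenizer_type='INTERNLM',
    model_config=dict(hidden_size=4096, num_layers=32),  # dict form now accepted
    model_type='INTERNLM2',
    max_seq_len=2048,
    end_str='<eoa>',
)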
@ -146,6 +191,7 @@ class InternTrain(BaseModel):
|
||||
bos_token_id=self.tokenizer.bos_id,
|
||||
pad_token_id=self.tokenizer.bos_id,
|
||||
eos_token_id=eos_token_ids)
|
||||
self.end_str = end_str
|
||||
|
||||
def _load_model(self,
|
||||
path: str,
|
||||
@ -242,7 +288,7 @@ class InternTrain(BaseModel):
|
||||
else:
|
||||
raise NotImplementedError(f'Unknown model dtype {model_dtype}')
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
def get_token_len(self, prompt: str, use_bos=None, use_eos=None) -> int:
|
||||
"""Get lengths of the tokenized strings.
|
||||
|
||||
Args:
|
||||
@ -251,7 +297,7 @@ class InternTrain(BaseModel):
|
||||
Returns:
|
||||
int: Length of the input tokens
|
||||
"""
|
||||
tokens = self.tokenizer(prompt, use_bos=True, use_eos=True)
|
||||
tokens = self.tokenizer(prompt, use_bos=use_bos, use_eos=use_eos)
|
||||
return len(tokens)
|
||||
|
||||
def generate(self,
|
||||
@ -287,8 +333,10 @@ class InternTrain(BaseModel):
|
||||
max_length=tokens.shape[1] + max_out_len,
|
||||
**self.generation_kwargs) # bsz, num_return_sequences, max_length
|
||||
outputs = outputs[:, 0, tokens.shape[1]:]
|
||||
output_text = self.batch_decode(outputs,
|
||||
stopping_criteria=stopping_criteria)
|
||||
output_text = self.batch_decode(
|
||||
outputs,
|
||||
eos_token_ids=self.generator.eos_token_id,
|
||||
stopping_criteria=stopping_criteria)
|
||||
|
||||
return output_text
|
||||
|
||||
@ -343,7 +391,7 @@ class InternTrain(BaseModel):
|
||||
for input_text, cont in zip(input_texts, conts)
|
||||
]
|
||||
replaced_lens = [
|
||||
len(self.encode(input_text)[0]) for input_text in replaced_texts
|
||||
self.get_token_len(input_text) for input_text in replaced_texts
|
||||
]
|
||||
loglikelihoods = []
|
||||
for nloss, nlen, rlen in zip(loss, lens, replaced_lens):
|
||||
@ -407,11 +455,22 @@ class InternTrain(BaseModel):
|
||||
|
||||
return torch.LongTensor(tokens).cuda()
|
||||
|
||||
def batch_decode(self, outputs, stopping_criteria: List[str] = []):
|
||||
def batch_decode(self,
|
||||
outputs,
|
||||
eos_token_ids: List[int],
|
||||
stopping_criteria: List[str] = []):
|
||||
# outputs: bsz, seq_len
|
||||
output_text = []
|
||||
outputs = outputs.tolist()
|
||||
for output in outputs:
|
||||
text = self.tokenizer.decode(output.tolist())
|
||||
# cut off by eos_token_ids
|
||||
eos_idx = len(output)
|
||||
for eos_id in eos_token_ids:
|
||||
if eos_id in output:
|
||||
eos_idx = min(output.index(eos_id), eos_idx)
|
||||
text = self.tokenizer.decode(output[:eos_idx])
|
||||
if self.end_str is not None:
|
||||
text = text.split(self.end_str)[0]
|
||||
for stop_word in stopping_criteria:
|
||||
text = text.split(stop_word)[0]
|
||||
output_text.append(text)
|
||||
|
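The reworked batch_decode above truncates each sequence at the earliest eos token before decoding, and only then trims end_str and the stop words. A minimal standalone illustration of that cutoff step (the token ids below are invented for the example):

# Illustration of the eos cutoff used in batch_decode; the ids are made up.
def cut_at_eos(output, eos_token_ids):
    eos_idx = len(output)
    for eos_id in eos_token_ids:
        if eos_id in output:
            eos_idx = min(output.index(eos_id), eos_idx)
    return output[:eos_idx]

assert cut_at_eos([5, 9, 2, 7], eos_token_ids=[2]) == [5, 9]    # cut before eos id 2
assert cut_at_eos([5, 9, 7], eos_token_ids=[2]) == [5, 9, 7]    # no eos: keep everything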
@ -1,188 +0,0 @@
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from opencompass.models.base import BaseModel
|
||||
from opencompass.utils.logging import get_logger
|
||||
from opencompass.utils.prompt import PromptList
|
||||
|
||||
PromptType = Union[PromptList, str]
|
||||
|
||||
|
||||
def valid_str(string, coding='utf-8'):
|
||||
"""decode text according to its encoding type."""
|
||||
invalid_chars = [b'\xef\xbf\xbd']
|
||||
bstr = bytes(string, coding)
|
||||
for invalid_char in invalid_chars:
|
||||
bstr = bstr.replace(invalid_char, b'')
|
||||
ret = bstr.decode(encoding=coding, errors='ignore')
|
||||
return ret
|
||||
|
||||
|
||||
class LmdeployPytorchModel(BaseModel):
|
||||
"""Model wrapper for lmdeploy pytorch engine through python API.
|
||||
|
||||
Args:
|
||||
path (str): path of the supported pytorch model.
|
||||
max_seq_len (int): The maximum allowed sequence length of a model.
|
||||
Note that the length of prompt + generated tokens shall not exceed
|
||||
this value. Defaults to 2048.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
engine_config (Dict, optional): The engine config to set
|
||||
arguments like session_len, max_batch_size for TurboMind.
|
||||
gen_config (Dict, optional): Generation config to set
|
||||
arguments like top_k, top_p, temperature.
|
||||
end_str (str, optional): Whether to trim generated strings with end_str
|
||||
if the model has special ending strings that are not handled well.
|
||||
Defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
path: str,
|
||||
concurrency: int = 8,
|
||||
max_seq_len: int = 2048,
|
||||
meta_template: Optional[Dict] = None,
|
||||
engine_config: Optional[Dict] = None,
|
||||
gen_config: Optional[Dict] = None,
|
||||
end_str: Optional[str] = None):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
meta_template=meta_template)
|
||||
from lmdeploy.pytorch import engine as tm
|
||||
from lmdeploy.version import version_info
|
||||
|
||||
if engine_config is not None:
|
||||
from lmdeploy.messages import PytorchEngineConfig
|
||||
engine_config = PytorchEngineConfig(**engine_config)
|
||||
# set thread_safe
|
||||
if hasattr(engine_config, 'thread_safe'):
|
||||
engine_config.thread_safe = True
|
||||
|
||||
if gen_config is not None:
|
||||
from lmdeploy.messages import GenerationConfig
|
||||
gen_config = GenerationConfig(**gen_config)
|
||||
|
||||
self.logger = get_logger()
|
||||
tm_model = tm.Engine(path, engine_config)
|
||||
self.tokenizer = tm_model.tokenizer
|
||||
self.generators = [
|
||||
tm_model.create_instance() for i in range(concurrency)
|
||||
]
|
||||
self.generator_ids = [i + 1 for i in range(concurrency)]
|
||||
|
||||
from transformers import GenerationConfig
|
||||
try:
|
||||
generation_config = GenerationConfig.from_pretrained(path)
|
||||
except Exception:
|
||||
generation_config = None
|
||||
if generation_config and hasattr(generation_config, 'eos_token_id'):
|
||||
if gen_config.stop_words is None:
|
||||
stop_words = []
|
||||
if isinstance(generation_config.eos_token_id, int):
|
||||
stop_words.append(generation_config.eos_token_id)
|
||||
else:
|
||||
assert isinstance(generation_config.eos_token_id, list)
|
||||
for token_id in generation_config.eos_token_id:
|
||||
stop_words.append(token_id)
|
||||
gen_config.stop_words = stop_words
|
||||
if version_info >= (0, 6, 0):
|
||||
gen_config.stop_token_ids = stop_words
|
||||
self.gen_config = gen_config
|
||||
self.end_str = end_str
|
||||
self.major_version, self.minor_version = version_info[:2]
|
||||
|
||||
def generate(
|
||||
self,
|
||||
inputs: List[str],
|
||||
max_out_len: int = 512,
|
||||
) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
inputs (List[str]): A list of prompts
|
||||
max_out_len (int): The maximum length of the output.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
assert isinstance(
|
||||
inputs, List), f'List(str) is expected, but got {type(inputs)}'
|
||||
|
||||
# split inputs into batches
|
||||
batch_size = len(self.generators)
|
||||
batch_inputs = [
|
||||
inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)
|
||||
]
|
||||
|
||||
results = []
|
||||
for batch_input in batch_inputs:
|
||||
with ThreadPoolExecutor() as executor:
|
||||
_results = list(
|
||||
executor.map(
|
||||
self._generate,
|
||||
self.generators[:len(batch_input)],
|
||||
self.generator_ids[:len(batch_input)],
|
||||
batch_input,
|
||||
[self.gen_config] * len(batch_input),
|
||||
[self.end_str] * len(batch_input),
|
||||
))
|
||||
results += _results
|
||||
return results
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
input_ids = self.tokenizer.encode(prompt)
|
||||
return len(input_ids)
|
||||
|
||||
def wait(self):
|
||||
"""Wait till the next query can be sent.
|
||||
|
||||
Applicable in both single-thread and multi-thread environments.
|
||||
"""
|
||||
return self.token_bucket.get_token()
|
||||
|
||||
def _generate(self,
|
||||
generator,
|
||||
session_id,
|
||||
prompt: PromptType,
|
||||
gen_config=None,
|
||||
end_str: Optional[str] = None) -> str:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
prompt (PromptType): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
gen_config (GenerationConfig, optional): Generation
|
||||
config to set arguments like top_k, top_p, temperature.
|
||||
end_str (str, optional): Whether to trim generated strings
|
||||
with end_str if the model has special ending strings
|
||||
that are not handled well.
|
||||
Defaults to None.
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert type(
|
||||
prompt) is str, 'We only support string for TurboMind Python API'
|
||||
input_ids = self.tokenizer.encode(prompt)
|
||||
if self.major_version >= 0 and self.minor_version >= 4:
|
||||
outputs = generator.infer(session_id,
|
||||
input_ids,
|
||||
gen_config=gen_config)
|
||||
output_ids = outputs.token_ids
|
||||
else:
|
||||
_, output_ids, _ = generator.infer(session_id,
|
||||
input_ids,
|
||||
gen_config=gen_config)
|
||||
|
||||
# stop engine
|
||||
if hasattr(generator, 'end'):
|
||||
generator.end(session_id)
|
||||
# decode output
|
||||
response_all = self.tokenizer.decode(output_ids)
|
||||
# trim output
|
||||
if end_str:
|
||||
response_all = response_all.split(end_str)[0]
|
||||
# remove invalid characters
|
||||
response_all = valid_str(response_all)
|
||||
return response_all
|
@ -1,200 +0,0 @@
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
from queue import Queue
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from opencompass.models.base import BaseModel, LMTemplateParser
|
||||
from opencompass.utils.logging import get_logger
|
||||
from opencompass.utils.prompt import PromptList
|
||||
|
||||
PromptType = Union[PromptList, str]
|
||||
|
||||
|
||||
def valid_str(string, coding='utf-8'):
|
||||
"""decode text according to its encoding type."""
|
||||
invalid_chars = [b'\xef\xbf\xbd']
|
||||
bstr = bytes(string, coding)
|
||||
for invalid_char in invalid_chars:
|
||||
bstr = bstr.replace(invalid_char, b'')
|
||||
ret = bstr.decode(encoding=coding, errors='ignore')
|
||||
return ret
|
||||
|
||||
|
||||
def prepare_tensor(name, input_tensor):
|
||||
"""Create grpcclient's InferInput instance according to a given tensor."""
|
||||
import tritonclient.grpc as grpcclient
|
||||
from tritonclient.utils import np_to_triton_dtype
|
||||
t = grpcclient.InferInput(name, list(input_tensor.shape),
|
||||
np_to_triton_dtype(input_tensor.dtype))
|
||||
t.set_data_from_numpy(input_tensor)
|
||||
return t
|
||||
|
||||
|
||||
def stream_callback(que, result, error):
|
||||
"""callback function invoked by triton client."""
|
||||
que.put((result, error))
|
||||
|
||||
|
||||
class LmdeployTisModel(BaseModel):
|
||||
"""Model wrapper for LMDeploy Python Backend Triton Inference Server gRPC
|
||||
API.
|
||||
|
||||
Args:
|
||||
path (str): The name of OpenAI's model.
|
||||
tis_addr (str): The address (ip:port format) of turbomind's
|
||||
triton inference server
|
||||
max_seq_len (int): The maximum allowed sequence length of a model.
|
||||
Note that the length of prompt + generated tokens shall not exceed
|
||||
this value. Defaults to 2048.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
"""
|
||||
|
||||
is_api: bool = True
|
||||
|
||||
def __init__(self,
|
||||
path: str,
|
||||
tis_addr: str = '0.0.0.0:33337',
|
||||
max_seq_len: int = 2048,
|
||||
meta_template: Optional[Dict] = None,
|
||||
end_str: Optional[str] = None):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
meta_template=meta_template)
|
||||
from lmdeploy.tokenizer import Tokenizer
|
||||
|
||||
self.logger = get_logger()
|
||||
self.template_parser = LMTemplateParser(meta_template)
|
||||
self.eos_token_id = None
|
||||
if meta_template and 'eos_token_id' in meta_template:
|
||||
self.eos_token_id = meta_template['eos_token_id']
|
||||
self.tis_addr = tis_addr
|
||||
self.tokenizer = Tokenizer(path)
|
||||
self.end_str = end_str
|
||||
|
||||
def generate(
|
||||
self,
|
||||
inputs: List[str or PromptList],
|
||||
max_out_len: int = 512,
|
||||
temperature: float = 1.0,
|
||||
) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
inputs (List[str or PromptList]): A list of strings or PromptDicts.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
temperature (float): What sampling temperature to use,
|
||||
between 0 and 2. Higher values like 0.8 will make the output
|
||||
more random, while lower values like 0.2 will make it more
|
||||
focused and deterministic. Defaults to 1.0.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
results = list(
|
||||
executor.map(self._generate, inputs,
|
||||
[max_out_len] * len(inputs),
|
||||
[temperature] * len(inputs),
|
||||
[self.end_str] * len(inputs)))
|
||||
return results
|
||||
|
||||
def wait(self):
|
||||
"""Wait till the next query can be sent.
|
||||
|
||||
Applicable in both single-thread and multi-thread environments.
|
||||
"""
|
||||
return self.token_bucket.get_token()
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
input_ids = self.tokenizer.encode(prompt)
|
||||
return len(input_ids)
|
||||
|
||||
def _call_triton_server(self, prompt, tis_addr, session_id,
|
||||
request_output_len, temperature, res_que):
|
||||
import tritonclient.grpc as grpcclient
|
||||
|
||||
with grpcclient.InferenceServerClient(tis_addr) as client:
|
||||
inputs = [
|
||||
prepare_tensor('prompt',
|
||||
np.array([prompt.encode()], dtype=np.object_)),
|
||||
prepare_tensor('max_tokens',
|
||||
np.array([request_output_len], dtype=np.int32)),
|
||||
prepare_tensor('temperature',
|
||||
np.array([temperature], dtype=np.float_)),
|
||||
prepare_tensor('top_p', np.array([1.0], dtype=np.float_)),
|
||||
prepare_tensor('top_k', np.array([1], dtype=np.int32)),
|
||||
prepare_tensor('ignore_eos', np.array([False],
|
||||
dtype=np.bool_)),
|
||||
prepare_tensor('stream', np.array([True], dtype=np.bool_)),
|
||||
]
|
||||
|
||||
# async_stream
|
||||
client.start_stream(partial(stream_callback, res_que))
|
||||
client.async_stream_infer('lmdeploy_model',
|
||||
inputs,
|
||||
sequence_id=session_id,
|
||||
sequence_start=True,
|
||||
sequence_end=True)
|
||||
|
||||
res_que.put(None)
|
||||
return
|
||||
|
||||
def _process_result(self, que):
|
||||
text = ''
|
||||
while True:
|
||||
res = que.get()
|
||||
if res is not None:
|
||||
result, err = res
|
||||
if err is not None:
|
||||
print(err)
|
||||
else:
|
||||
res = result.as_numpy('response').item().decode()
|
||||
text += res
|
||||
else:
|
||||
return text
|
||||
|
||||
def _generate(self,
|
||||
prompt: str or PromptList,
|
||||
max_out_len: int,
|
||||
temperature: float,
|
||||
end_str: Optional[str] = None) -> str:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
prompt (str or PromptList): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
temperature (float): What sampling temperature to use,
|
||||
between 0 and 2. Higher values like 0.8 will make the output
|
||||
more random, while lower values like 0.2 will make it more
|
||||
focused and deterministic.
|
||||
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert type(
|
||||
prompt
|
||||
) is str, 'We only support string for LMDeploy Python Backend TIS API'
|
||||
|
||||
res_que = Queue()
|
||||
|
||||
self._call_triton_server(prompt=prompt,
|
||||
tis_addr=self.tis_addr,
|
||||
session_id=threading.currentThread().ident,
|
||||
request_output_len=max_out_len,
|
||||
temperature=temperature,
|
||||
res_que=res_que)
|
||||
text = self._process_result(res_que)
|
||||
response = valid_str(text)
|
||||
if end_str:
|
||||
response = response.split(end_str)[0]
|
||||
return response
|
@ -601,6 +601,10 @@ class OpenAISDK(OpenAI):
if self.verbose:
self.logger.info(
'Successfully get response from OpenAI API')
try:
self.logger.info(responses)
except Exception as e:  # noqa F841
pass
return responses.choices[0].message.content
except Exception as e:
self.logger.error(e)
||||
|
@ -1,135 +0,0 @@
|
||||
import logging
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from opencompass.models.base import BaseModel, LMTemplateParser
|
||||
from opencompass.utils.logging import get_logger
|
||||
from opencompass.utils.prompt import PromptList
|
||||
|
||||
PromptType = Union[PromptList, str]
|
||||
|
||||
|
||||
def valid_str(string, coding='utf-8'):
|
||||
"""decode text according to its encoding type."""
|
||||
invalid_chars = [b'\xef\xbf\xbd']
|
||||
bstr = bytes(string, coding)
|
||||
for invalid_char in invalid_chars:
|
||||
bstr = bstr.replace(invalid_char, b'')
|
||||
ret = bstr.decode(encoding=coding, errors='ignore')
|
||||
return ret
|
||||
|
||||
|
||||
class TurboMindTisModel(BaseModel):
|
||||
"""Model wrapper for TurboMind Triton Inference Server gRPC API.
|
||||
|
||||
Args:
|
||||
path (str): The name of OpenAI's model.
|
||||
tis_addr (str): The address (ip:port format) of turbomind's
|
||||
triton inference server
|
||||
max_seq_len (int): The maximum allowed sequence length of a model.
|
||||
Note that the length of prompt + generated tokens shall not exceed
|
||||
this value. Defaults to 2048.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
"""
|
||||
|
||||
is_api: bool = True
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
tis_addr: str = '0.0.0.0:33337',
|
||||
max_seq_len: int = 2048,
|
||||
meta_template: Optional[Dict] = None,
|
||||
):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
meta_template=meta_template)
|
||||
from lmdeploy.serve.turbomind.utils import Preprocessor
|
||||
self.preprocess = Preprocessor(tis_addr)
|
||||
self.logger = get_logger()
|
||||
self.template_parser = LMTemplateParser(meta_template)
|
||||
self.eos_token_id = None
|
||||
if meta_template and 'eos_token_id' in meta_template:
|
||||
self.eos_token_id = meta_template['eos_token_id']
|
||||
self.tis_addr = tis_addr
|
||||
|
||||
def generate(
|
||||
self,
|
||||
inputs: List[PromptType],
|
||||
max_out_len: int = 512,
|
||||
temperature: float = 1.0,
|
||||
) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
inputs (List[PromptType]): A list of strings or PromptDicts.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
temperature (float): What sampling temperature to use,
|
||||
between 0 and 2. Higher values like 0.8 will make the output
|
||||
more random, while lower values like 0.2 will make it more
|
||||
focused and deterministic. Defaults to 1.0.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
results = list(
|
||||
executor.map(self._generate, inputs,
|
||||
[max_out_len] * len(inputs),
|
||||
[temperature] * len(inputs)))
|
||||
return results
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
input_ids, _ = self.preprocess(prompt)
|
||||
return input_ids.shape[-1]
|
||||
|
||||
def wait(self):
|
||||
"""Wait till the next query can be sent.
|
||||
|
||||
Applicable in both single-thread and multi-thread environments.
|
||||
"""
|
||||
return self.token_bucket.get_token()
|
||||
|
||||
def _generate(self, prompt: PromptType, max_out_len: int,
|
||||
temperature: float) -> str:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
prompt (PromptType): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
temperature (float): What sampling temperature to use,
|
||||
between 0 and 2. Higher values like 0.8 will make the output
|
||||
more random, while lower values like 0.2 will make it more
|
||||
focused and deterministic.
|
||||
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert type(
|
||||
prompt) is str, 'We only support string for TurboMind RPC API'
|
||||
|
||||
from lmdeploy.serve.turbomind.chatbot import Chatbot
|
||||
chatbot = Chatbot(self.tis_addr,
|
||||
temperature=temperature,
|
||||
capability='completion',
|
||||
top_k=1,
|
||||
log_level=logging.ERROR)
|
||||
|
||||
for status, text, n_token in chatbot.stream_infer(
|
||||
session_id=threading.currentThread().ident,
|
||||
prompt=prompt,
|
||||
request_output_len=max_out_len,
|
||||
sequence_start=True,
|
||||
sequence_end=True):
|
||||
continue
|
||||
response = valid_str(text)
|
||||
response = response.replace('<eoa>', '')
|
||||
return response
|
@ -1,7 +1,6 @@
|
||||
# flake8: noqa
|
||||
# yapf: disable
|
||||
import copy
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from opencompass.models.base import BaseModel
|
||||
@ -31,38 +30,32 @@ class TurboMindModelwithChatTemplate(BaseModel):
|
||||
self,
|
||||
path: str,
|
||||
tokenizer_only: bool = False,
|
||||
backend: str = 'turbomind',
|
||||
engine_config: Dict = {},
|
||||
gen_config: Dict = {},
|
||||
concurrency: int = 8,
|
||||
max_seq_len: int = None,
|
||||
meta_template: Optional[Dict] = None,
|
||||
fastchat_template: Optional[str] = None,
|
||||
stop_words: List[str] = [],
|
||||
):
|
||||
from lmdeploy.messages import TurbomindEngineConfig
|
||||
from lmdeploy.turbomind import TurboMind
|
||||
from lmdeploy.version import version_info
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
self.logger = get_logger()
|
||||
self.path = path
|
||||
self.tokenizer_only = tokenizer_only
|
||||
self.template_parser = _get_meta_template(meta_template)
|
||||
self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
|
||||
|
||||
self.origin_tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
|
||||
from lmdeploy import version_info
|
||||
from transformers import AutoTokenizer
|
||||
self.version_info = version_info
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
|
||||
if not tokenizer_only:
|
||||
DEFAULT_ENGING_CONFIG = {'session_len': self.max_seq_len}
|
||||
_engine_config = DEFAULT_ENGING_CONFIG.copy()
|
||||
_engine_config.update(engine_config)
|
||||
engine_config = TurbomindEngineConfig(**_engine_config)
|
||||
tm_model = TurboMind.from_pretrained(path, engine_config=engine_config)
|
||||
self.tokenizer = tm_model.tokenizer
|
||||
self.generators = [tm_model.create_instance() for i in range(concurrency)]
|
||||
self.generator_ids = [i + 1 for i in range(concurrency)]
|
||||
self.concurrency = concurrency
|
||||
self.pipe = self._build_pipe(path, backend, _engine_config)
|
||||
else:
|
||||
self.pipe = None
|
||||
self.gen_config = gen_config
|
||||
self.version_info = version_info
|
||||
self.fastchat_template = fastchat_template
|
||||
self.stop_words = list(set(stop_words + self._get_potential_stop_words(path)))
|
||||
self.logger.info(f'using stop words: {self.stop_words}')
|
||||
@ -76,23 +69,23 @@ class TurboMindModelwithChatTemplate(BaseModel):
|
||||
generation_config = None
|
||||
if generation_config and hasattr(generation_config, 'eos_token_id'):
|
||||
if isinstance(generation_config.eos_token_id, int):
|
||||
potential_stop_words.append(self.origin_tokenizer.decode(generation_config.eos_token_id))
|
||||
potential_stop_words.append(self.tokenizer.decode(generation_config.eos_token_id))
|
||||
else:
|
||||
assert isinstance(generation_config.eos_token_id, list)
|
||||
for token_id in generation_config.eos_token_id:
|
||||
potential_stop_words.append(self.origin_tokenizer.decode(token_id))
|
||||
if self.origin_tokenizer.eos_token is not None:
|
||||
potential_stop_words.append(self.origin_tokenizer.eos_token)
|
||||
potential_stop_words.append(self.tokenizer.decode(token_id))
|
||||
if self.tokenizer.eos_token is not None:
|
||||
potential_stop_words.append(self.tokenizer.eos_token)
|
||||
potential_stop_words = list(set(potential_stop_words))
|
||||
potential_stop_words = [s for s in potential_stop_words if s]
|
||||
return potential_stop_words
|
||||
|
||||
def generate(self,
|
||||
inputs: List[str],
|
||||
max_out_len: int = 512,
|
||||
max_out_len: int,
|
||||
stopping_criteria: List[str] = [],
|
||||
do_sample: Optional[bool] = None,
|
||||
temperature: int = 1,
|
||||
temperature: float = 1.0,
|
||||
**kwargs) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
@ -104,93 +97,45 @@ class TurboMindModelwithChatTemplate(BaseModel):
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
assert isinstance(inputs, List), f'List(str) is expected, but got {type(inputs)}'
|
||||
|
||||
messages = _convert_chat_messages(inputs)
|
||||
if self.fastchat_template:
|
||||
messages = _format_with_fast_chat_template(messages, self.fastchat_template)
|
||||
else:
|
||||
messages = [self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages]
|
||||
|
||||
# split messages into batches
|
||||
batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)]
|
||||
messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages]
|
||||
|
||||
stop_words = list(set(self.stop_words + stopping_criteria))
|
||||
encode_stop_words = []
|
||||
if stop_words is not None and len(stop_words) > 0:
|
||||
for words in stop_words:
|
||||
encode_stop_words += self.tokenizer.encode(words, add_bos=False)
|
||||
|
||||
DEFAULT_GEN_CONFIG = {
|
||||
'max_new_tokens': max_out_len,
|
||||
'min_new_tokens': 1,
|
||||
'top_k': 1,
|
||||
'stop_words': encode_stop_words,
|
||||
'stop_words': stop_words,
|
||||
}
|
||||
|
||||
gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
|
||||
gen_config.update(self.gen_config)
|
||||
if do_sample:
|
||||
gen_config['top_k'] = 1000
|
||||
gen_config['top_k'] = 40
|
||||
gen_config['temperature'] = temperature
|
||||
else:
|
||||
if self.version_info >= (0, 6, 0):
|
||||
gen_config['do_sample'] = False
|
||||
else:
|
||||
gen_config['top_k'] = 1
|
||||
|
||||
from lmdeploy.messages import GenerationConfig
|
||||
from lmdeploy import GenerationConfig
|
||||
gen_config = {k: v for k, v in gen_config.items() if hasattr(GenerationConfig, k)}
|
||||
gen_config = GenerationConfig(**gen_config)
|
||||
if self.version_info >= (0, 6, 0):
|
||||
gen_config.stop_words = stop_words
|
||||
gen_config.convert_stop_bad_words_to_ids(self.tokenizer)
|
||||
|
||||
results = []
|
||||
for batch_message in batch_messages:
|
||||
n = len(batch_message)
|
||||
with ThreadPoolExecutor() as executor:
|
||||
_results = list(
|
||||
executor.map(
|
||||
self._generate,
|
||||
self.generators[:n],
|
||||
self.generator_ids[:n],
|
||||
batch_message,
|
||||
[gen_config] * n,
|
||||
))
|
||||
results += _results
|
||||
outputs = self.pipe(messages, gen_config=gen_config, do_preprocess=False)
|
||||
for output in outputs:
|
||||
text = self.tokenizer.decode(output.token_ids)
|
||||
results.append(text)
|
||||
|
||||
for s in stop_words:
|
||||
results = [r.split(s)[0] for r in results]
|
||||
return results
|
||||
|
||||
def _generate(self,
|
||||
generator,
|
||||
session_id,
|
||||
prompt: PromptType,
|
||||
gen_config=None) -> str:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
prompt (PromptType): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
gen_config (GenerationConfig, optional): Generation
|
||||
config to set arguments like top_k, top_p, temperature.
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert type(prompt) is str, 'We only support string for TurboMind Python API'
|
||||
|
||||
input_ids = self.tokenizer.encode(prompt, add_bos=False)
|
||||
for outputs in generator.stream_infer(session_id=session_id,
|
||||
input_ids=[input_ids],
|
||||
gen_config=gen_config,
|
||||
sequence_start=True,
|
||||
sequence_end=True,
|
||||
step=0,
|
||||
stream_output=False):
|
||||
if self.version_info >= (0, 4, 0):
|
||||
output_ids = outputs.token_ids
|
||||
else:
|
||||
_, output_ids, _ = outputs
|
||||
response = self.tokenizer.decode(output_ids)
|
||||
response = valid_str(response)
|
||||
return response
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
"""Get lengths of the tokenized strings.
|
||||
|
||||
@ -201,5 +146,20 @@ class TurboMindModelwithChatTemplate(BaseModel):
|
||||
int: Length of the input tokens
|
||||
"""
|
||||
m = _convert_chat_messages([prompt])[0]
|
||||
t = self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True)
|
||||
t = self.tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True)
|
||||
return len(t['input_ids'])
|
||||
|
||||
def _build_pipe(self, model_path, backend, engine_config):
|
||||
from lmdeploy import (PytorchEngineConfig, TurbomindEngineConfig,
|
||||
pipeline)
|
||||
|
||||
assert backend in ['pytorch', 'turbomind'], \
|
||||
f'unsupported backend type: {backend}'
|
||||
|
||||
if backend == 'turbomind':
|
||||
filtered = {k: v for k, v in engine_config.items() if hasattr(TurbomindEngineConfig, k)}
|
||||
backend_config = TurbomindEngineConfig(**filtered)
|
||||
else:
|
||||
filtered = {k: v for k, v in engine_config.items() if hasattr(PytorchEngineConfig, k)}
|
||||
backend_config = PytorchEngineConfig(**filtered)
|
||||
return pipeline(model_path, backend_config=backend_config, log_level='INFO', max_log_len=10)
|
||||
|
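_build_pipe above keeps only the engine_config keys that the selected backend's config class actually defines before constructing it, so unknown keys are silently dropped. A generic sketch of that filtering pattern (the dataclass and keys below are invented for the example):

# Generic sketch of the "keep only known fields" filter; DummyEngineConfig is invented.
from dataclasses import dataclass

@dataclass
class DummyEngineConfig:
    session_len: int = 2048
    max_batch_size: int = 8

user_cfg = {'session_len': 32768, 'max_batch_size': 16, 'unknown_flag': True}
filtered = {k: v for k, v in user_cfg.items() if hasattr(DummyEngineConfig, k)}
print(DummyEngineConfig(**filtered))  # 'unknown_flag' is dropped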
@ -232,6 +232,8 @@ class DLCRunner(BaseRunner):
while True:
# 1. Avoid requesting DLC too frequently.
# 2. The DLC job may not be ready immediately after creation.
dlc_sleep_time = self.aliyun_cfg.get('dlc_sleep_time', 10)
time.sleep(dlc_sleep_time)
num_retry = 60
for retry_index in range(num_retry):
time.sleep(2)
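The polling loop now waits a configurable dlc_sleep_time (read from aliyun_cfg, defaulting to 10 seconds) before querying the job status. A hedged config sketch showing only the new key (other aliyun_cfg fields are omitted and the value is just an example):

# Hedged sketch: only the new key is shown; 30 is an example value.
aliyun_cfg = dict(
    dlc_sleep_time=30,  # seconds to wait before polling the DLC job status
)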
@ -4,6 +4,7 @@ from .all_obj import AllObjSummarizer
from .alpacaeval import AlpacaSummarizer
from .arenahard import ArenaHardSummarizer
from .charm import CharmMemSummarizer
from .common_summarizer import CommonSummarizer
from .compass_arena import CompassArenaSummarizer
from .compassbench import CompassBenchSummarizer
from .corev2 import Corev2Summarizer
||||
|
146
opencompass/summarizers/subjective/common_summarizer.py
Normal file
@ -0,0 +1,146 @@
|
||||
# flake8: noqa
|
||||
# yapf: disable
|
||||
import csv
|
||||
import os
|
||||
import os.path as osp
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
from mmengine import ConfigDict
|
||||
from tabulate import tabulate
|
||||
|
||||
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
|
||||
|
||||
from .compass_arena import CompassArenaSummarizer
|
||||
from .utils import get_judgeanswer_and_reference, get_outdir
|
||||
|
||||
|
||||
def model_abbr_from_cfg_used_in_summarizer(model):
|
||||
if model.get('summarizer_abbr', None):
|
||||
return model['summarizer_abbr']
|
||||
else:
|
||||
return model_abbr_from_cfg(model)
|
||||
|
||||
def post_process_single_rate(judgement: str):
|
||||
"""Input a string like below:
|
||||
|
||||
xxx[[5]]xxx, and extract the score
|
||||
"""
|
||||
pattern = r'Rating:\s*\[\[([\d.]+)\]\]'
|
||||
matched_result = re.findall(pattern, judgement)
|
||||
if matched_result:
|
||||
score = float(matched_result[0])
|
||||
else:
|
||||
return None
|
||||
return {'score': score}
|
||||
|
||||
|
||||
def get_capability_results(
|
||||
judged_answers,
|
||||
references,
|
||||
fout,
|
||||
fout_flag,
|
||||
model_abbr,
|
||||
judge_model_abbr,
|
||||
dataset_abbr,
|
||||
):
|
||||
capability_ratings = defaultdict(int)
|
||||
capability_counts = defaultdict(int)
|
||||
for ans, ref in zip(judged_answers, references):
|
||||
capability_ratings['total'] += ans['score']
|
||||
capability_counts['total'] += 1
|
||||
capability_ratings[ref['capability']] += ans['score']
|
||||
capability_counts[ref['capability']] += 1
|
||||
|
||||
capability_avg_ratings = defaultdict(float)
|
||||
|
||||
for capability, total_score in capability_ratings.items():
|
||||
s = total_score / capability_counts[capability]
|
||||
s = round(s, 2)
|
||||
capability_avg_ratings[capability] = s
|
||||
columns = list(capability_avg_ratings.keys())
|
||||
columns.insert(0, columns.pop(columns.index('total')))
|
||||
|
||||
if fout_flag == 0:
|
||||
with open(fout, 'w', newline='') as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
if fout_flag == 0:
|
||||
writer.writerow(['model', 'judge_model', 'dataset'] + columns)
|
||||
writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns])
|
||||
else:
|
||||
with open(fout, 'a+', newline='') as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns])
|
||||
|
||||
|
||||
class CommonSummarizer(CompassArenaSummarizer):
|
||||
"""Do the subjectivity analyze based on evaluation results.
|
||||
|
||||
Args:
|
||||
config (ConfigDict): The configuration object of the evaluation task.
|
||||
It's expected to be filled out at runtime.
|
||||
"""
|
||||
|
||||
def __init__(self, config: ConfigDict, judge_type='single_rate') -> None:
|
||||
self.judge_type = judge_type
|
||||
self.tasks = []
|
||||
self.cfg = config
|
||||
self.judge_type = 'single_rate'
|
||||
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
|
||||
self.judge_model_cfgs = self.cfg['judge_models']
|
||||
self.judge_map = {
|
||||
'single_rate': post_process_single_rate
|
||||
}
|
||||
self.judge_function = self.judge_map[self.judge_type]
|
||||
|
||||
def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
|
||||
"""Summarize the subjectivity analysis based on evaluation results.
|
||||
|
||||
Args:
|
||||
time_str (str): Timestamp for file naming.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: The summary results.
|
||||
"""
|
||||
if self.judge_type == 'pair':
|
||||
return super().summarize()
|
||||
|
||||
# self.judge_type == 'single'
|
||||
dataset_cfgs = self.cfg['datasets']
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
fout_flag = 0
|
||||
output_tmp_file = osp.join(output_dir, 'result.csv')
|
||||
output_file = osp.join(output_dir, 'total_result.csv')
|
||||
for eval_model_cfg in self.eval_model_cfgs:
|
||||
for judge_model_cfg in self.judge_model_cfgs:
|
||||
eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
|
||||
show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
|
||||
show_judge_model_abbr = model_abbr_from_cfg_used_in_summarizer(judge_model_cfg)
|
||||
judge_abbr = model_abbr_from_cfg(judge_model_cfg)
|
||||
subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + judge_abbr)
|
||||
if os.path.isdir(subdir_path):
|
||||
for dataset in dataset_cfgs:
|
||||
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
|
||||
show_dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
|
||||
get_capability_results(judged_answers, references, output_tmp_file, fout_flag, show_model_abbr, show_judge_model_abbr, show_dataset_abbr)
|
||||
fout_flag += 1
|
||||
else:
|
||||
print(subdir_path + ' does not exist! Please check!')
|
||||
with open(output_tmp_file, 'r') as f:
|
||||
csv_reader = csv.reader(f)
|
||||
header = next(csv_reader)
|
||||
table = [line for line in csv_reader]
|
||||
|
||||
new_header = [''] + [line[0] for line in table]
|
||||
new_table = [[h] + line[1:] for h, line in zip(header[1:], table)]
|
||||
new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
|
||||
t = tabulate(new_table, headers=new_header)
|
||||
with open(output_file, 'a') as f:
|
||||
f.write(','.join(new_header) + '\n')
|
||||
for line in new_table:
|
||||
f.write(','.join(map(str, line)) + '\n')
|
||||
print(t)
|
||||
print(output_file)
|
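post_process_single_rate above extracts the numeric score from a judge reply of the form 'Rating: [[x]]' and returns it as a dict, or None when no rating is found. A quick usage sketch (the judgement strings are invented):

# Usage sketch for post_process_single_rate; the judgement text is made up.
judgement = 'The answer is mostly correct but misses one edge case. Rating: [[7.5]]'
print(post_process_single_rate(judgement))         # {'score': 7.5}
print(post_process_single_rate('no rating here'))  # None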
@ -9,7 +9,7 @@ from mmengine.config import Config
from opencompass.datasets.custom import make_custom_dataset_config
from opencompass.models import (VLLM, HuggingFace, HuggingFaceBaseModel,
HuggingFaceCausalLM, HuggingFaceChatGLM3,
HuggingFacewithChatTemplate, TurboMindModel,
HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate,
VLLMwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
@ -233,7 +233,7 @@ def change_accelerator(models, accelerator):
model_accels = []
for model in models:
logger.info(f'Transforming {model["abbr"]} to {accelerator}')
# change HuggingFace model to VLLM or TurboMindModel
# change HuggingFace model to VLLM or LMDeploy
if model['type'] in [HuggingFace, HuggingFaceCausalLM, HuggingFaceChatGLM3, f'{HuggingFaceBaseModel.__module__}.{HuggingFaceBaseModel.__name__}']:
gen_args = dict()
if model.get('generation_kwargs') is not None:
@ -254,10 +254,10 @@ def change_accelerator(models, accelerator):

if accelerator == 'lmdeploy':
logger.info(f'Transforming {model["abbr"]} to {accelerator}')
mod = TurboMindModel
mod = TurboMindModelwithChatTemplate
acc_model = dict(
type=f'{mod.__module__}.{mod.__name__}',
abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
path=model['path'],
engine_config=dict(session_len=model['max_seq_len'],
max_batch_size=model['batch_size'],
@ -270,7 +270,6 @@ def change_accelerator(models, accelerator):
max_out_len=model['max_out_len'],
max_seq_len=model['max_seq_len'],
batch_size=model['batch_size'],
concurrency=model['batch_size'],
run_cfg=model['run_cfg'],
)
for item in ['meta_template']:
@ -312,7 +311,7 @@ def change_accelerator(models, accelerator):
mod = TurboMindModelwithChatTemplate
acc_model = dict(
type=f'{mod.__module__}.{mod.__name__}',
abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
path=model['path'],
engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
||||
|
@ -1,6 +1,7 @@
# Alpaca-eval
alpaca-eval==0.6
cn2an
dingo-python
# Icl topk retriever
faiss_gpu==1.7.2
# Humaneval, Humaneval X

@ -23,6 +23,7 @@ python-Levenshtein
rank_bm25==0.2.2
rapidfuzz
requests>=2.31.0
retrying
rich
rouge
-e git+https://github.com/Isaac-JL-Chen/rouge_chinese.git@master#egg=rouge_chinese