Merge branch 'open-compass:main' into main

Commit: 346c06015a
Author: bittersweet1999 (committed by GitHub)
Date: 2024-10-12 11:05:00 +08:00
56 changed files with 2145 additions and 1032 deletions

View File

@ -8,15 +8,17 @@ with read_base():
race_datasets # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
models as hf_deepseek_v2_lite_model # noqa: F401, E501
# read hf models - chat models
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b import \
models as hf_gemma_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b import \
models as hf_gemma_7b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_2b import \
models as hf_gemma2_2b_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b import \
models as hf_gemma2_9b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
@ -31,16 +33,28 @@ with read_base():
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
models as hf_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \
models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
models as hf_qwen2_0_5b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
models as hf_qwen2_1_5b_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_7b import \
models as hf_qwen2_7b_model # noqa: F401, E501
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \

View File

@ -13,20 +13,32 @@ with read_base():
models as hf_baichuan2_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
models as hf_glm4_9b_chat_model # noqa: F401, E501
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
models as lmdeploy_glm4_9b_chat_model # noqa: F401, E501
from opencompass.configs.models.chatglm.vllm_glm4_9b_chat import \
models as vllm_glm4_9b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
models as hf_gemma_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
models as hf_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
models as hf_gemma2_2b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
models as hf_gemma2_9b_it_model # noqa: F401, E501
from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
models as vllm_gemma_7b_it_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
models as hf_internlm2_5_20b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
models as lmdeploy_internlm2_5_20b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
@ -37,14 +49,20 @@ with read_base():
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
models as hf_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \
models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
@ -57,6 +75,10 @@ with read_base():
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
models as hf_qwen2_1_5b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \
models as hf_qwen2_7b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \

View File

@ -8,29 +8,33 @@ output_path = 'regression_result_daily'
chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf',
'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind',
'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind',
'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind',
'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf',
'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf',
'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf',
'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind',
'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf',
'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm',
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
'lmdeploy-api-test'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf',
'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind',
'internlm2-7b-turbomind', 'internlm2-base-7b-hf',
'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf',
'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf',
'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf',
'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf',
'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind',
'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind',
'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf',
'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf',
'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']

View File

@ -8,6 +8,16 @@ glm-4-9b-chat-hf:
race-middle: 88
race-high: 88
glm-4-9b-chat-turbomind:
gsm8k: 69
race-middle: 82
race-high: 77
glm-4-9b-chat-vllm:
gsm8k: 73
race-middle: 87
race-high: 87
deepseek-7b-chat-hf:
gsm8k: 60
race-middle: 74
@ -18,6 +28,11 @@ deepseek-moe-16b-chat-hf:
race-middle: 62
race-high: 70
deepseek-v2-lite-chat-hf:
gsm8k: 59
race-middle: 82
race-high: 79
deepseek-7b-chat-vllm:
gsm8k: 63
race-middle: 74
@ -33,23 +48,48 @@ gemma-7b-it-hf:
race-middle: 74
race-high: 71
gemma-7b-it-vllm:
gsm8k: 38
race-middle: 75
race-high: 70
gemma2-2b-it-hf:
gsm8k: 62
race-middle: 75
race-high: 67
gemma2-9b-it-hf:
gsm8k: 80
race-middle: 89
race-high: 85
internlm2_5-7b-chat-hf:
gsm8k: 86
race-middle: 92
race-high: 93
internlm2_5-20b-chat-hf:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2_5-7b-chat-turbomind:
gsm8k: 87
race-middle: 92
race-high: 93
internlm2_5-20b-chat-turbomind:
gsm8k: 91
race-middle: 95
race-high: 91
internlm2-chat-1.8b-turbomind:
gsm8k: 40
race-middle: 82
race-high: 83
internlm2-chat-1.8b-sft-turbomind:
gsm8k: 32
gsm8k: 34
race-middle: 81
race-high: 83
@ -68,11 +108,21 @@ internlm2-chat-7b-vllm:
race-middle: 90
race-high: 91
llama-3_1-8b-instruct-hf:
gsm8k: 82
race-middle: 82
race-high: 88
llama-3-8b-instruct-hf:
gsm8k: 77
race-middle: 85
race-high: 87
llama-3_1-8b-instruct-turbomind:
gsm8k: 79
race-middle: 82
race-high: 88
llama-3-8b-instruct-turbomind:
gsm8k: 77
race-middle: 85
@ -83,6 +133,11 @@ mistral-7b-instruct-v0.2-hf:
race-middle: 82
race-high: 78
mistral-7b-instruct-v0.3-hf:
gsm8k: 53
race-middle: 80
race-high: 78
mistral-7b-instruct-v0.2-vllm:
gsm8k: 49
race-middle: 81
@ -118,6 +173,11 @@ qwen1.5-0.5b-chat-hf:
race-middle: 55
race-high: 50
qwen2-1.5b-instruct-hf:
gsm8k: 63
race-middle: 77
race-high: 86
qwen2-1.5b-instruct-turbomind:
gsm8k: 60
race-middle: 77
@ -128,6 +188,11 @@ qwen2-7b-instruct-turbomind:
race-middle: 87
race-high: 89
qwen2-7b-instruct-hf:
gsm8k: 85
race-middle: 87
race-high: 91
qwen1.5-0.5b-chat-vllm:
gsm8k: 5
race-middle: 57
@ -153,6 +218,11 @@ deepseek-moe-16b-base-hf:
race-middle: 35
race-high: 23
deepseek-v2-lite-hf:
gsm8k: 37
race-middle: 56
race-high: 62
deepseek-7b-base-turbomind:
gsm8k: 21
race-middle: 42
@ -173,8 +243,18 @@ gemma-7b-hf:
race-middle: 59
race-high: 66
gemma2-2b-hf:
gsm8k: 33
race-middle: 56
race-high: 58
gemma2-9b-hf:
gsm8k: 70
race-middle: 82
race-high: 84
internlm2_5-7b-hf:
gsm8k: 46
gsm8k: 47
race-middle: 92
race-high: 91
@ -208,6 +288,21 @@ internlm2-base-7b-turbomind:
race-middle: 75
race-high: 81
llama-2-7b-hf:
gsm8k: 17
race-middle: 32
race-high: 38
llama-3-8b-hf:
gsm8k: 48
race-middle: 64
race-high: 70
llama-3.1-8b-turbomind:
gsm8k: 57
race-middle: 67
race-high: 75
llama-3-8b-turbomind:
gsm8k: 52
race-middle: 63
@ -218,6 +313,11 @@ mistral-7b-v0.2-hf:
race-middle: 42
race-high: 60
mistral-7b-v0.3-hf:
gsm8k: 43
race-middle: 42
race-high: 60
mistral-7b-v0.2-vllm:
gsm8k: 45
race-middle: 42
@ -228,11 +328,21 @@ qwen1.5-moe-a2.7b-hf:
race-middle: 78
race-high: 90
qwen2-1.5b-hf:
gsm8k: 58
race-middle: 65
race-high: 78
qwen2-0.5b-hf:
gsm8k: 35
race-middle: 52
race-high: 48
qwen2-7b-hf:
gsm8k: 82
race-middle: 88
race-high: 89
qwen2-1.5b-turbomind:
gsm8k: 57
race-middle: 64

View File

@ -14,9 +14,14 @@ env:
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
HF_DATASETS_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
VLLM_USE_MODELSCOPE: false
LMDEPLOY_USE_MODELSCOPE: false
HF_HUB_OFFLINE: 1
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
@ -43,7 +48,11 @@ jobs:
daily_run_test:
needs: build-pypi
runs-on: self-hosted
strategy:
fail-fast: false
matrix:
cuda_env: [dsw_cu11, dsw_cu12]
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 420 #7hours
steps:
@ -53,22 +62,38 @@ jobs:
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}
- name: Prepare - create conda env and install torch
- name: Prepare - create conda env and install torch - cu11
if: ${{matrix.cuda_env == 'dsw_cu11'}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}} python=3.10
conda activate ${{env.CONDA_ENV}}
pip install opencompass*.whl
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}}
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - create conda env and install torch - cu12
if: ${{matrix.cuda_env == 'dsw_cu12'}}
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
pip install lmdeploy==0.6.0 --cache-dir ${{env.PIP_CACHE_PATH}} --no-cache-dir
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}}
pip uninstall torch torchvision torchaudio -y
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
conda info --envs
pip list
- name: Prepare - prepare data and hf model
run: |
ln -s ${{env.DATEASET_CACHE_PATH}} data
@ -77,45 +102,45 @@ jobs:
- name: Run chat model test
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run base model test
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base/*/summary regression_result_daily
opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Run command testcase
run: |
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda activate ${{env.CONDA_ENV}}
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
export from_tf=TRUE
python tools/list_configs.py internlm2_5 mmlu
python run.py --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
python run.py --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2/*/summary regression_result_daily
opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
python run.py --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3/*/summary regression_result_daily
opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
python run.py --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4/*/summary regression_result_daily
opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
- name: Remove Conda Env
if: always()
run: |
rm -rf regression_result_daily
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
conda env remove -y --name ${{env.CONDA_ENV}}
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
conda info --envs
notify_to_feishu:

View File

@ -17,7 +17,7 @@ jobs:
python-version: '3.10'
- name: Install pre-commit hook
run: |
pip install pre-commit mmengine
pip install pre-commit==3.8.0 mmengine
pre-commit install
- name: Linting
run: pre-commit run --all-files

View File

@ -51,7 +51,7 @@ jobs:
conda activate ${{env.CONDA_ENV}}
conda info --envs
rm -rf regression_result
python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug
opencompass --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug
- name: Get result
run: |
score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')

View File

@ -594,7 +594,7 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
## 🔜 Roadmap
- [x] Subjective Evaluation
- [x] Release CompassAreana
- [x] Release CompassAreana.
- [x] Subjective evaluation.
- [x] Long-context
- [x] Long-context evaluation with extensive datasets.
@ -603,10 +603,10 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
- [ ] Coding evaluation leaderboard.
- [x] Non-python language evaluation service.
- [x] Agent
- [ ] Support various agenet framework.
- [ ] Support various agent frameworks.
- [x] Evaluation of tool use of the LLMs.
- [x] Robustness
- [x] Support various attack method
- [x] Support various attack methods.
## 👷‍♂️ Contributing

View File

@ -0,0 +1,38 @@
from mmengine.config import read_base
from opencompass.models import BailingAPI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
from opencompass.configs.summarizers.medium import summarizer
datasets = [
*ceval_datasets,
]
models = [
dict(
path='Bailing-Lite-0830',
token='xxxxxx', # set your key here or in environment variable BAILING_API_KEY
url='https://bailingchat.alipay.com/chat/completions',
type=BailingAPI,
generation_kwargs={},
query_per_second=1,
max_seq_len=4096,
),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalAPIRunner,
max_num_workers=2,
concurrent_users=2,
task=dict(type=OpenICLInferTask),
),
)
work_dir = 'outputs/api_bailing/'
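Note: instead of committing a placeholder token, the key can also be pulled from the BAILING_API_KEY environment variable mentioned in the comment above. A minimal sketch of that variant (an illustration under that assumption, not part of this commit; it only changes how the token value is obtained):

import os
from opencompass.models import BailingAPI

models = [
    dict(
        path='Bailing-Lite-0830',
        # read the key from the environment rather than hard-coding it
        token=os.environ.get('BAILING_API_KEY', ''),
        url='https://bailingchat.alipay.com/chat/completions',
        type=BailingAPI,
        generation_kwargs={},
        query_per_second=1,
        max_seq_len=4096,
    ),
]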

View File

@ -0,0 +1,34 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DingoDataset, DingoEvaluator
dingo_paths = [
'./data/dingo/en_192.csv',
'./data/dingo/zh_170.csv',
]
dingo_datasets = []
for path in dingo_paths:
dingo_reader_cfg = dict(input_columns='input', output_column=None)
dingo_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt='{input}')])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT')
dingo_datasets.append(
dict(
abbr='dingo_' + path.split('/')[-1].split('.csv')[0],
type=DingoDataset,
path=path,
reader_cfg=dingo_reader_cfg,
infer_cfg=dingo_infer_cfg,
eval_cfg=dingo_eval_cfg,
))
datasets = dingo_datasets

View File

@ -15,7 +15,7 @@ subjective_all_sets = [
]
data_path ='data/subjective/followbench/converted_data'
followbench_llmeval_dataset = []
followbench_llmeval_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -48,7 +48,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
followbench_llmeval_dataset.append(
followbench_llmeval_datasets.append(
dict(
abbr=f'{_name}',
type=FollowBenchDataset,

View File

@ -0,0 +1,73 @@
import copy
from opencompass.datasets import WikiBenchDataset
from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
single_choice_prompts = {
'single_choice_cn': [
dict(role='HUMAN',
prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'),
dict(role='BOT', prompt='回答: C'),
dict(
role='HUMAN',
prompt='问题: 星期五广场荷兰语Vrijdagmarkt荷兰语发音 )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501
),
dict(role='BOT', prompt='回答: B'),
dict(
role='HUMAN',
prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次'
),
dict(role='BOT', prompt='回答: C'),
dict(
role='HUMAN',
prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501
),
dict(role='BOT', prompt='回答: B'),
dict(role='HUMAN',
prompt='问题: 丹徒县在1928年改名为什么\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 丹阳县'),
dict(role='BOT', prompt='回答: C'),
dict(role='HUMAN', prompt='问题: {question}'),
dict(role='BOT', prompt='回答: {answer}'),
]
}
wikibench_sets = {
'wiki': ['single_choice_cn'],
}
do_circular = True
wikibench_datasets = []
for _split in list(wikibench_sets.keys()):
for _name in wikibench_sets[_split]:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = copy.deepcopy(single_choice_prompts[_name])
one_template_round[-1]['prompt'] = one_template_round[-1][
'prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
wikibench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
wikibench_eval_cfg = dict(evaluator=dict(
type=CircularEvaluator if do_circular else AccEvaluator), )
wikibench_datasets.append(
dict(
type=WikiBenchDataset,
path=f'./data/WikiBench/{_name}.jsonl',
name='circular_' + _name if do_circular else _name,
abbr='wikibench-' + _split + '-' + _name +
'circular' if do_circular else '',
reader_cfg=dict(
input_columns=['question'],
output_column='answer',
),
infer_cfg=wikibench_infer_cfg,
eval_cfg=wikibench_eval_cfg,
))
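A standalone sketch of what the loop above builds (illustrative only, not part of this commit; trimmed to the last two rounds of the prompt list): each answer letter receives its own copy of the rounds, differing only in the final BOT turn, which is what lets PPLInferencer score the four candidate endings separately.

import copy

rounds = [dict(role='HUMAN', prompt='问题: {question}'),
          dict(role='BOT', prompt='回答: {answer}')]
template = {}
for answer in ['A', 'B', 'C', 'D']:
    one_round = copy.deepcopy(rounds)
    # only the last BOT turn changes between the four variants
    one_round[-1]['prompt'] = one_round[-1]['prompt'].format(answer=answer)
    template[answer] = dict(round=one_round)
print(template['B']['round'][-1])  # {'role': 'BOT', 'prompt': '回答: B'}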

View File

@ -0,0 +1,188 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets
from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets
# ## Math
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import math_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets
# ## Scientific
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \
gpqa_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import sanitized_mbpp_datasets
# TODO: Add LiveCodeBench
# ## Instruction Following
# from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups
# Model List
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import models as lmdeploy_qwen2_5_1_5b_model
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name': 'core_average',
'subsets': [
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'naive_average'],
['hellaswag', 'accuracy'],
['drop', 'accuracy'],
['math', 'accuracy'],
['gsm8k', 'accuracy'],
['mathbench-t (average)', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['IFEval', 'Prompt-level-strict-accuracy'],
['sanitized_mbpp', 'score'],
],
},
]
summarizer = dict(
dataset_abbrs=[
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'naive_average'],
['hellaswag', 'accuracy'],
['drop', 'accuracy'],
['math', 'accuracy'],
['gsm8k', 'accuracy'],
['mathbench-t (average)', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['IFEval', 'Prompt-level-strict-accuracy'],
['sanitized_mbpp', 'score'],
'mathbench-a (average)',
'mathbench-t (average)',
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math','accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science','accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=8
),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)
),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_2409_objective/'
work_dir = osp.join(base_exp_dir, 'base_objective')
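Aside: the datasets, summary_groups, and models assignments above all use the same collection idiom — every variable in the config namespace whose name ends with a given suffix is flattened into one list, which is why the imported configs only need to follow the *_datasets / *_summary_groups / *_model naming convention. A minimal standalone sketch of the idiom (the variable names here are made up for illustration):

gsm8k_datasets = [dict(abbr='gsm8k')]
race_datasets = [dict(abbr='race-middle'), dict(abbr='race-high')]

# flatten every *_datasets list defined so far into a single list
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
print([d['abbr'] for d in datasets])  # ['gsm8k', 'race-middle', 'race-high']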

View File

@ -0,0 +1,220 @@
from mmengine.config import read_base
import os.path as osp
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import drop_datasets
# ## Math
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import mathbench_datasets
# ## Scientific
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import sanitized_mbpp_datasets
# TODO: Add LiveCodeBench
# ## Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
# Model List
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name': 'core_average',
'subsets': [
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
['drop', 'accuracy'],
['sanitized_mbpp', 'score'],
['gsm8k', 'accuracy'],
['hellaswag', 'accuracy'],
['mathbench-t (average)', 'naive_average']
],
},
]
summarizer = dict(
dataset_abbrs=[
['core_average', 'naive_average'],
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
['drop', 'accuracy'],
['sanitized_mbpp', 'score'],
['gsm8k', 'accuracy'],
['hellaswag', 'accuracy'],
'mathbench-a (average)',
'mathbench-t (average)',
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math','accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science','accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
'',
['bbh', 'extract_rate'],
['math', 'extract_rate'],
# ['openai_humaneval', 'extract_rate'],
['GPQA_diamond', 'extract_rate'],
# ['IFEval', 'extract_rate'],
'',
['mmlu', 'extract_rate'],
['mmlu-stem', 'extract_rate'],
['mmlu-social-science', 'extract_rate'],
['mmlu-humanities', 'extract_rate'],
['mmlu-other', 'extract_rate'],
'',
['mmlu_pro', 'extract_rate'],
['mmlu_pro_math', 'extract_rate'],
['mmlu_pro_physics', 'extract_rate'],
['mmlu_pro_chemistry', 'extract_rate'],
['mmlu_pro_law', 'extract_rate'],
['mmlu_pro_engineering', 'extract_rate'],
['mmlu_pro_other', 'extract_rate'],
['mmlu_pro_economics', 'extract_rate'],
['mmlu_pro_health', 'extract_rate'],
['mmlu_pro_psychology', 'extract_rate'],
['mmlu_pro_business', 'extract_rate'],
['mmlu_pro_biology', 'extract_rate'],
['mmlu_pro_philosophy', 'extract_rate'],
['mmlu_pro_computer_science', 'extract_rate'],
['mmlu_pro_history', 'extract_rate'],
'',
['cmmlu', 'extract_rate'],
['cmmlu-stem', 'extract_rate'],
['cmmlu-social-science', 'extract_rate'],
['cmmlu-humanities', 'extract_rate'],
['cmmlu-other', 'extract_rate'],
['cmmlu-china-specific', 'extract_rate'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=8
),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)
),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_2409_objective/'
work_dir = osp.join(base_exp_dir, 'chat_objective')

View File

@ -0,0 +1,138 @@
import os.path as osp
from copy import deepcopy
from mmengine.config import read_base
from opencompass.models import (HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
from opencompass.configs.datasets.longbench.longbench import \
longbench_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \
needlebench_datasets as needlebench_8k_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \
needlebench_datasets as needlebench_32k_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
needlebench_datasets as needlebench_128k_datasets
from opencompass.configs.datasets.ruler.ruler_8k_gen import \
ruler_datasets as ruler_8k_datasets
from opencompass.configs.datasets.ruler.ruler_32k_gen import \
ruler_datasets as ruler_32k_datasets
from opencompass.configs.datasets.ruler.ruler_128k_gen import \
ruler_datasets as ruler_128k_datasets
# Summary Groups
from opencompass.configs.summarizers.groups.longbench import \
longbench_summary_groups
from opencompass.configs.summarizers.groups.ruler import \
ruler_summary_groups
from opencompass.configs.summarizers.needlebench import (
needlebench_8k_summarizer, needlebench_32k_summarizer,
needlebench_128k_summarizer)
# Instruct models
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct_model
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
models as lmdeploy_internlm2_5_7b_1m_chat_model
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as llama3_1_8b_instruct_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups']
needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups']
needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups']
# Instruct models summarizer
summarizer = dict(
dataset_abbrs=[
['ruler_8k', 'naive_average'],
['ruler_32k', 'naive_average'],
['ruler_128k', 'naive_average'],
['NeedleBench-Overall-Score-8K', 'weighted_average'],
['NeedleBench-Overall-Score-32K', 'weighted_average'],
['NeedleBench-Overall-Score-128K', 'weighted_average'],
['longbench', 'naive_average'],
['longbench_zh', 'naive_average'],
['longbench_en', 'naive_average'],
'',
'longbench_single-document-qa',
'longbench_multi-document-qa',
'longbench_summarization',
'longbench_few-shot-learning',
'longbench_synthetic-tasks',
'longbench_code-completion',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4
llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4
llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=8
),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)
),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'long_context')

View File

@ -0,0 +1,134 @@
import os.path as osp
from copy import deepcopy
from mmengine.config import read_base
from opencompass.models import (HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate)
from opencompass.models.openai_api import OpenAI, OpenAISDK
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
arenahard_datasets
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
alignbench_datasets
from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \
mtbench_datasets
# Summarizer
# Model List
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
#######################################################################
# PART 3 Models List #
#######################################################################
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internlm2_5-7b-chat-turbomind',
path='internlm/internlm2_5-7b-chat',
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
gen_config=dict(top_k=40, temperature=1.0, top_p=0.9, max_new_tokens=4096),
max_seq_len=16384,
max_out_len=4096,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]
models = sum([v for k, v in locals().items() if k.endswith('_model')], models)
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=8
),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)
),
)
# JudgeLLM
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
judge_models = [
dict(
type=OpenAISDK,
abbr='gpt-4o-2024-08-06',
path='gpt-4o-2024-08-06',
# openai_api_base=
# 'http://10.140.1.86:10001/v1', # Change to your own url if needed.
key='YOUR_API_KEY',
retry=10,
meta_template=api_meta_template,
rpm_verbose=True,
query_per_second=1,
max_out_len=4096,
max_seq_len=16384,
batch_size=16,
temperature=0.01,
tokenizer_path='gpt-4o-2024-08-06'
)
]
# Evaluation with local runner
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'chat_subjective')

configs/eval_dingo.py (new file, 7 lines added)
View File

@ -0,0 +1,7 @@
from mmengine.config import read_base
with read_base():
from .models.hf_internlm.hf_internlm_7b import models
from .datasets.dingo.dingo_gen import datasets
work_dir = './outputs/eval_dingo'

View File

@ -1,69 +0,0 @@
from mmengine.config import read_base
from opencompass.models import LmdeployPytorchModel
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a choosen format
from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
# config for internlm-chat-7b
internlm_chat_7b = dict(
type=LmdeployPytorchModel,
abbr='internlm-chat-7b-pytorch',
path='internlm/internlm-chat-7b',
engine_config=dict(session_len=2048,
max_batch_size=16),
gen_config=dict(top_k=1,
top_p=0.8,
temperature=1.0,
max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=16,
concurrency=16,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
# config for internlm-chat-20b
internlm_chat_20b = dict(
type=LmdeployPytorchModel,
abbr='internlm-chat-20b-pytorch',
path='internlm/internlm-chat-20b',
engine_config=dict(session_len=2048,
max_batch_size=8),
gen_config=dict(top_k=1,
top_p=0.8,
temperature=1.0,
max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
concurrency=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
models = [internlm_chat_20b]

View File

@ -1,41 +0,0 @@
from mmengine.config import read_base
from opencompass.models.lmdeploy_tis import LmdeployTisModel
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a choosen format
from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
],
eos_token_id=92542
)
models = [
dict(
type=LmdeployTisModel,
abbr='internlm-chat-20b-lmdeploy-tis',
path='internlm/internlm-chat-20b',
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]

View File

@ -1,40 +0,0 @@
from mmengine.config import read_base
from opencompass.models.turbomind_tis import TurboMindTisModel
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(
round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
eos_token_id=103028)
models = [
dict(
type=TurboMindTisModel,
abbr='internlm-chat-20b-turbomind',
path='internlm',
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@ -1,28 +0,0 @@
from mmengine.config import read_base
from opencompass.models.turbomind_tis import TurboMindTisModel
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = [
dict(
type=TurboMindTisModel,
abbr='internlm-chat-20b-turbomind',
path='internlm',
tis_addr='0.0.0.0:33337',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@ -0,0 +1,30 @@
from opencompass.models import BailingAPI
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=False),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = [
dict(
path='Bailing-Lite-0830',
token='', # set your key here or in environment variable BAILING_API_KEY
url='https://bailingchat.alipay.com/chat/completions',
type=BailingAPI,
meta_template=api_meta_template,
query_per_second=1,
max_seq_len=4096,
batch_size=1,
generation_kwargs={
'temperature': 0.4,
'top_p': 1.0,
'top_k': -1,
'n': 1,
'logprobs': 1,
'use_beam_search': False,
},
),
]

View File

@ -0,0 +1,30 @@
from opencompass.models import BailingAPI
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=False),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = [
dict(
path='Bailing-Pro-0920',
token='', # set your key here or in environment variable BAILING_API_KEY
url='https://bailingchat.alipay.com/chat/completions',
type=BailingAPI,
meta_template=api_meta_template,
query_per_second=1,
max_seq_len=4096,
batch_size=1,
generation_kwargs={
'temperature': 0.4,
'top_p': 1.0,
'top_k': -1,
'n': 1,
'logprobs': 1,
'use_beam_search': False,
},
),
]

View File

@ -1,15 +1,24 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internlm2-chat-7b-turbomind',
abbr=f'internlm2-chat-7b-lmdeploy',
path='internlm/internlm2-chat-7b',
engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
# If the model is not supported by 'turbomind', it will fallback to
# 'pytorch'
backend='turbomind',
# For the detailed engine config and generation config, please refer to
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
engine_config=dict(tp=1),
gen_config=dict(do_sample=False),
max_seq_len=8192,
max_out_len=4096,
batch_size=16,
# the max number of prompts that LMDeploy receives
# in `generate` function
batch_size=5000,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-1.5b-turbomind',
path='Qwen/Qwen2.5-1.5B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-7b-turbomind',
path='Qwen/Qwen2.5-7B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,88 @@
# Evaluation with LMDeploy
We now support evaluating models accelerated by [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit for compressing, deploying, and serving LLMs, with remarkable inference performance. This guide illustrates how to evaluate a model with LMDeploy support in OpenCompass.
## Setup
### Install OpenCompass
Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install OpenCompass and prepare the evaluation datasets.
### Install LMDeploy
Install lmdeploy via pip (python 3.8+)
```shell
pip install lmdeploy
```
The prebuilt packages are compiled against CUDA 12 by default. If you need to run on CUDA 11+, install lmdeploy as follows:
```shell
export LMDEPLOY_VERSION=0.6.0
export PYTHON_VERSION=310
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```
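To double-check which build you ended up with (an optional sanity check, not part of the original guide), you can print LMDeploy's version tuple; this is the same `version_info` that OpenCompass's model wrappers read:
```python
# print the installed LMDeploy version tuple, e.g. (0, 6, 0)
from lmdeploy.version import version_info

print(version_info)
```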
## Evaluation
When evaluating a model, it is necessary to prepare an evaluation configuration that specifies information such as the evaluation dataset, the model, and inference parameters.
Taking [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as an example, the evaluation config is as follows:
```python
# configure the dataset
from mmengine.config import read_base
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# configure lmdeploy
from opencompass.models import TurboMindModelwithChatTemplate
# configure the model
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr=f'internlm2-chat-7b-lmdeploy',
# model path, which can be the address of a model repository on the Hugging Face Hub or a local path
path='internlm/internlm2-chat-7b',
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
# If the model is not supported by 'turbomind', it will fallback to
# 'pytorch'
backend='turbomind',
# For the detailed engine config and generation config, please refer to
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
engine_config=dict(tp=1),
gen_config=dict(do_sample=False),
# the max size of the context window
max_seq_len=7168,
# the max number of new tokens
max_out_len=1024,
# the max number of prompts that LMDeploy receives
# in `generate` function
batch_size=5000,
run_cfg=dict(num_gpus=1),
)
]
```
Save the above configuration to a file, such as "configs/eval_internlm2_lmdeploy.py". Then, from the root directory of OpenCompass, start the evaluation with the following command:
```shell
python run.py configs/eval_internlm2_lmdeploy.py -w outputs
```
You will get the evaluation results once inference and evaluation have finished.
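As a side note (a minimal sketch, not part of the original guide): because `datasets` is assembled by collecting every imported `*_datasets` variable, you can narrow the aggregated list inside the config before launching, for example keeping only GSM8K:
```python
# evaluate only GSM8K instead of every imported *_datasets list
datasets = [*gsm8k_datasets]
```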

View File

@ -1,78 +0,0 @@
# Evaluation with LMDeploy
We now support evaluating models accelerated by [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit for compressing, deploying, and serving LLMs. **TurboMind** is an efficient inference engine proposed by LMDeploy, and OpenCompass is compatible with it. This guide illustrates how to evaluate a model with TurboMind support in OpenCompass.
## Setup
### Install OpenCompass
Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install OpenCompass and prepare the evaluation datasets.
### Install LMDeploy
Install lmdeploy via pip (python 3.8+)
```shell
pip install lmdeploy
```
## Evaluation
OpenCompass integrates TurboMind's Python API for evaluation.
We take InternLM-20B as an example. First, prepare the evaluation config `configs/eval_internlm_turbomind.py`:
```python
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# config for internlm-20b model
internlm_20b = dict(
type=TurboMindModel,
abbr='internlm-20b-turbomind',
path="internlm/internlm-20b", # this path should be same as in huggingface
engine_config=dict(session_len=2048,
max_batch_size=8,
rope_scaling_factor=1.0),
gen_config=dict(top_k=1, top_p=0.8,
temperature=1.0,
max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
concurrency=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>'
)
models = [internlm_20b]
```
Then, in the home folder of OpenCompass, start evaluation by the following command:
```shell
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
```
You will get the evaluation results once inference and evaluation have finished.
**Note**:
- If you want to pass more arguments for `engine_config` and `gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig)
and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig)
- If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py`
- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing the last line to `models = [internlm_7b]`.

View File

@ -0,0 +1,86 @@
# Accelerating Evaluation with LMDeploy
We support using [LMDeploy](https://github.com/InternLM/lmdeploy) as the inference acceleration engine when evaluating large language models. LMDeploy is a full-stack solution for lightweight deployment and serving of LLM and VLM workloads, with remarkable inference performance. This tutorial introduces how to accelerate model evaluation with LMDeploy.
## Setup
### Install OpenCompass
Please follow the OpenCompass [installation guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install OpenCompass and prepare the evaluation datasets.
### Install LMDeploy
Install lmdeploy via pip (python 3.8+)
```shell
pip install lmdeploy
```
The prebuilt LMDeploy packages are compiled against CUDA 12 by default. If you need to install LMDeploy under CUDA 11+, run the following commands:
```shell
export LMDEPLOY_VERSION=0.6.0
export PYTHON_VERSION=310
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```
## Evaluation
When evaluating a model, you need to prepare an evaluation configuration that specifies the datasets, the model, and the inference parameters.
Taking the [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) model as an example, the configuration is as follows:
```python
# configure the dataset
from mmengine.config import read_base
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# configure lmdeploy
from opencompass.models import TurboMindModelwithChatTemplate
# configure the model
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr=f'internlm2-chat-7b-lmdeploy',
# model path, which can be the address of a model repository on the Hugging Face Hub or a local path
path='internlm/internlm2-chat-7b',
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
# If the model is not supported by 'turbomind', it will fallback to
# 'pytorch'
backend='turbomind',
# For the detailed engine config and generation config, please refer to
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
engine_config=dict(tp=1),
gen_config=dict(do_sample=False),
# the max size of the context window
max_seq_len=7168,
# the max number of new tokens
max_out_len=1024,
# the max number of prompts that LMDeploy receives
# in `generate` function
batch_size=32,
run_cfg=dict(num_gpus=1),
)
]
```
Save the above configuration to a file, such as "configs/eval_internlm2_lmdeploy.py". Then, from the root directory of OpenCompass, run the following command to get the evaluation results:
```shell
python run.py configs/eval_internlm2_lmdeploy.py -w outputs
```

View File

@ -1,75 +0,0 @@
# Evaluating LMDeploy Models
We support evaluating large language models accelerated by [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy, jointly developed by the MMDeploy and MMRazor teams, is a full-stack solution for lightweight deployment and serving of LLM workloads. **TurboMind** is the efficient inference engine provided by LMDeploy. OpenCompass has been adapted to TurboMind, and this tutorial introduces how to evaluate TurboMind-accelerated models with OpenCompass.
## Setup
### Install OpenCompass
Please follow the OpenCompass [installation guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install OpenCompass and prepare the evaluation datasets.
### Install LMDeploy
Install lmdeploy via pip (python 3.8+)
```shell
pip install lmdeploy
```
## Evaluation
OpenCompass supports evaluating datasets through TurboMind's Python API.
The following takes the InternLM-20B model as an example. First, prepare the evaluation config file `configs/eval_internlm_turbomind.py`:
```python
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
# choose a list of datasets
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
# and output the results in a chosen format
from .summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# config for internlm-20b model
internlm_20b = dict(
type=TurboMindModel,
abbr='internlm-20b-turbomind',
path="internlm/internlm-20b", # 注意路径与huggingface保持一致
engine_config=dict(session_len=2048,
max_batch_size=8,
rope_scaling_factor=1.0),
gen_config=dict(top_k=1, top_p=0.8,
temperature=1.0,
max_new_tokens=100),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
concurrency=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>'
)
models = [internlm_20b]
```
Then, from the root directory of OpenCompass, run the following command to get the evaluation results:
```shell
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
```
**Note:**
- If you want to pass more arguments for `engine_config` and `gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) and [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig)
- If you evaluate the InternLM Chat model, please use the configuration file `eval_internlm_chat_turbomind.py`
- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py`, setting the `models` field to `models = [internlm_7b]`.

View File

@ -1 +1 @@
__version__ = '0.3.2.post1'
__version__ = '0.3.3'

View File

@ -0,0 +1,34 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DingoDataset, DingoEvaluator
dingo_paths = [
'./data/dingo/en_192.csv',
'./data/dingo/zh_170.csv',
]
dingo_datasets = []
for path in dingo_paths:
dingo_reader_cfg = dict(input_columns='input', output_column=None)
dingo_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt='{input}')])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT')
dingo_datasets.append(
dict(
abbr='dingo_' + path.split('/')[-1].split('.csv')[0],
type=DingoDataset,
path=path,
reader_cfg=dingo_reader_cfg,
infer_cfg=dingo_infer_cfg,
eval_cfg=dingo_eval_cfg,
))
datasets = dingo_datasets

View File

@ -15,7 +15,7 @@ subjective_all_sets = [
]
data_path ='data/subjective/followbench/converted_data'
followbench_llmeval_dataset = []
followbench_llmeval_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -48,7 +48,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
followbench_llmeval_dataset.append(
followbench_llmeval_datasets.append(
dict(
abbr=f'{_name}',
type=FollowBenchDataset,

View File

@ -0,0 +1,73 @@
import copy
from opencompass.datasets import WikiBenchDataset
from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
single_choice_prompts = {
'single_choice_cn': [
dict(role='HUMAN',
prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'),
dict(role='BOT', prompt='回答: C'),
dict(
role='HUMAN',
prompt='问题: 星期五广场荷兰语Vrijdagmarkt荷兰语发音 )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501
),
dict(role='BOT', prompt='回答: B'),
dict(
role='HUMAN',
prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次'
),
dict(role='BOT', prompt='回答: C'),
dict(
role='HUMAN',
prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501
),
dict(role='BOT', prompt='回答: B'),
dict(role='HUMAN',
prompt='问题: 丹徒县在1928年改名为什么\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 丹阳县'),
dict(role='BOT', prompt='回答: C'),
dict(role='HUMAN', prompt='问题: {question}'),
dict(role='BOT', prompt='回答: {answer}'),
]
}
wikibench_sets = {
'wiki': ['single_choice_cn'],
}
do_circular = True
wikibench_datasets = []
for _split in list(wikibench_sets.keys()):
for _name in wikibench_sets[_split]:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = copy.deepcopy(single_choice_prompts[_name])
one_template_round[-1]['prompt'] = one_template_round[-1][
'prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
wikibench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
wikibench_eval_cfg = dict(evaluator=dict(
type=CircularEvaluator if do_circular else AccEvaluator), )
wikibench_datasets.append(
dict(
type=WikiBenchDataset,
path=f'./data/WikiBench/{_name}.jsonl',
name='circular_' + _name if do_circular else _name,
abbr='wikibench-' + _split + '-' + _name +
'circular' if do_circular else '',
reader_cfg=dict(
input_columns=['question'],
output_column='answer',
),
infer_cfg=wikibench_infer_cfg,
eval_cfg=wikibench_eval_cfg,
))

View File

@ -0,0 +1,30 @@
from opencompass.models import BailingAPI
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=False),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = [
dict(
path='Bailing-Lite-0830',
token='', # set your key here or in environment variable BAILING_API_KEY
url='https://bailingchat.alipay.com/chat/completions',
type=BailingAPI,
meta_template=api_meta_template,
query_per_second=1,
max_seq_len=4096,
batch_size=1,
generation_kwargs={
'temperature': 0.4,
'top_p': 1.0,
'top_k': -1,
'n': 1,
'logprobs': 1,
'use_beam_search': False,
},
),
]

View File

@ -0,0 +1,30 @@
from opencompass.models import BailingAPI
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=False),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
models = [
dict(
path='Bailing-Pro-0920',
token='', # set your key here or in environment variable BAILING_API_KEY
url='https://bailingchat.alipay.com/chat/completions',
type=BailingAPI,
meta_template=api_meta_template,
query_per_second=1,
max_seq_len=4096,
batch_size=1,
generation_kwargs={
'temperature': 0.4,
'top_p': 1.0,
'top_k': -1,
'n': 1,
'logprobs': 1,
'use_beam_search': False,
},
),
]

View File

@ -1,15 +1,24 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internlm2-chat-7b-turbomind',
abbr=f'internlm2-chat-7b-lmdeploy',
path='internlm/internlm2-chat-7b',
engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
# If the model is not supported by 'turbomind', it will fallback to
# 'pytorch'
backend='turbomind',
# For the detailed engine config and generation config, please refer to
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
engine_config=dict(tp=1),
gen_config=dict(do_sample=False),
max_seq_len=8192,
max_out_len=4096,
batch_size=16,
# the max number of prompts that LMDeploy receives
# in `generate` function
batch_size=5000,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-1.5b-turbomind',
path='Qwen/Qwen2.5-1.5B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModel
models = [
dict(
type=TurboMindModel,
abbr='qwen2.5-7b-turbomind',
path='Qwen/Qwen2.5-7B',
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
max_seq_len=7168,
max_out_len=1024,
batch_size=16,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -16,7 +16,7 @@ class GaokaoBenchDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
data = get_data_path(path, local_mode=True)
path = get_data_path(path, local_mode=True)
if environ.get('DATASET_SOURCE') == 'ModelScope':
from modelscope import MsDataset
return MsDataset.load(path, subset_name=name, split='test')

View File

@ -33,6 +33,7 @@ from .crowspairs_cn import * # noqa: F401, F403
from .csl import * # noqa: F401, F403
from .custom import * # noqa: F401, F403
from .cvalues import * # noqa: F401, F403
from .dingo import * # noqa: F401, F403
from .drcd import * # noqa: F401, F403
from .drop import * # noqa: F401, F403
from .drop_simple_eval import * # noqa: F401, F403

View File

@ -0,0 +1,84 @@
# flake8: noqa
# yapf: disable
import csv
import json
import os
import time
from typing import List
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class DingoDataset(BaseDataset):
@staticmethod
def load(path: str):
raw_data = []
with open(path, encoding='utf-8') as f:
reader = csv.reader(f, delimiter=';')
for row in reader:
if len(row) < 1:
row = ['']
raw_data.append({'input': row[0]})
return Dataset.from_list(raw_data)
@LOAD_DATASET.register_module()
class DingoLongDataset(BaseDataset):
@staticmethod
def load(path: str):
raw_data = []
with open(path, 'r', encoding='utf-8') as f:
for line in f:
raw_data.append({'input': json.loads(line).get('input')})
return Dataset.from_list(raw_data)
@ICL_EVALUATORS.register_module()
class DingoEvaluator(BaseEvaluator):
def score(self, origin_prompt: List, predictions: List) -> dict:
try:
# from dingo.model.model import Model
from dingo.exec import Executor
from dingo.io import InputArgs
except Exception:
raise ModuleNotFoundError(
'=========== '
'dingo register fail. please try: pip install dingo-python.'
' ===========')
current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
file_data = [{'prompt': pmt, 'prediction': prd}
for pmt, prd in zip(origin_prompt, predictions)]
file_name = 'dingo_file_' + current_time + '.jsonl'
with open(file_name, 'a', encoding='utf-8') as f:
for d in file_data:
json.dump(d, f, ensure_ascii=False)
f.write('\n')
input_data = {
'eval_models': ['llm_base'],
'input_path': file_name,
'output_path': './outputs/dingo/',
'dataset': 'local',
'datasource': 'local',
'data_format': 'jsonl',
'column_prompt': ['prompt'],
'column_content': ['prediction'],
}
# Model.apply_config(input_data["custom_config_path"])
input_args = InputArgs(**input_data)
executor = Executor.exec_map['local'](input_args)
result = executor.execute()
summary = result[0].to_dict()
os.remove(file_name)
return summary

View File

@ -3,6 +3,7 @@ from .ai360_api import AI360GPT # noqa: F401
from .alaya import AlayaLM # noqa: F401
from .baichuan_api import BaiChuan # noqa: F401
from .baidu_api import ERNIEBot # noqa: F401
from .bailing_api_oc import BailingAPI # noqa: F401
from .base import BaseModel, LMTemplateParser # noqa: F401
from .base_api import APITemplateParser, BaseAPIModel # noqa: F401
from .bytedance_api import ByteDance # noqa: F401
@ -24,8 +25,6 @@ from .interntrain import InternTrain # noqa: F401
from .krgpt_api import KrGPT # noqa: F401
from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401
from .llama2 import Llama2, Llama2Chat # noqa: F401
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
from .lmdeploy_tis import LmdeployTisModel # noqa: F401
from .minimax_api import MiniMax, MiniMaxChatCompletionV2 # noqa: F401
from .mistral_api import Mistral # noqa: F401
from .mixtral import Mixtral # noqa: F401
@ -40,7 +39,6 @@ from .rendu_api import Rendu # noqa: F401
from .sensetime_api import SenseTime # noqa: F401
from .stepfun_api import StepFun # noqa: F401
from .turbomind import TurboMindModel # noqa: F401
from .turbomind_tis import TurboMindTisModel # noqa: F401
from .turbomind_with_tf_above_v4_33 import \
TurboMindModelwithChatTemplate # noqa: F401
from .unigpt_api import UniGPT # noqa: F401

View File

@ -0,0 +1,225 @@
import concurrent
import concurrent.futures
import os
import socket
import traceback
from typing import Dict, List, Optional, Union
import requests
from requests.adapters import HTTPAdapter
from urllib3.connection import HTTPConnection
try:
from retrying import retry
except ImportError:
retry = None
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
class HTTPAdapterWithSocketOptions(HTTPAdapter):
def __init__(self, *args, **kwargs):
self._socket_options = HTTPConnection.default_socket_options + [
(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1),
(socket.SOL_TCP, socket.TCP_KEEPIDLE, 75),
(socket.SOL_TCP, socket.TCP_KEEPINTVL, 30),
(socket.SOL_TCP, socket.TCP_KEEPCNT, 120),
]
super(HTTPAdapterWithSocketOptions, self).__init__(*args, **kwargs)
def init_poolmanager(self, *args, **kwargs):
if self._socket_options is not None:
kwargs['socket_options'] = self._socket_options
super(HTTPAdapterWithSocketOptions,
self).init_poolmanager(*args, **kwargs)
class BailingAPI(BaseAPIModel):
"""Model wrapper around Bailing Service.
Args:
output_key (str): key for prediction
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
generation_kwargs: other params
retry (int): Number of retries if the API call fails. Defaults to 3.
"""
def __init__(
self,
path: str,
token: str,
url: str,
meta_template: Optional[Dict] = None,
query_per_second: int = 1,
retry: int = 3,
generation_kwargs: Dict = {},
max_seq_len=4096,
):
super().__init__(
path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry,
generation_kwargs=generation_kwargs,
)
self.logger.info(f'Bailing API Model Init path: {path} url={url}')
if not token:
token = os.environ.get('BAILING_API_KEY')
if token:
self._headers = {'Authorization': f'Bearer {token}'}
else:
raise RuntimeError('There is no valid token.')
else:
self._headers = {'Authorization': f'Bearer {token}'}
self._headers['Content-Type'] = 'application/json'
self._url = url if url else \
'https://bailingchat.alipay.com/chat/completions'
self._model = path
self._sessions = []
self._num = (int(os.environ.get('BAILING_API_PARALLEL_NUM'))
if os.environ.get('BAILING_API_PARALLEL_NUM') else 1)
try:
for _ in range(self._num):
adapter = HTTPAdapterWithSocketOptions()
sess = requests.Session()
sess.mount('http://', adapter)
sess.mount('https://', adapter)
self._sessions.append(sess)
except Exception as e:
self.logger.error(f'Fail to setup the session. {e}')
raise e
def generate(
self,
inputs: Union[List[str], PromptList],
max_out_len: int = 4096,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (Union[List[str], PromptList]):
A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass' API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with concurrent.futures.ThreadPoolExecutor(
max_workers=self._num, ) as executor:
future_to_m = {
executor.submit(
self._generate,
self._sessions[i % self._num],
input,
max_out_len,
): i
for i, input in enumerate(inputs)
}
results = []
for future in concurrent.futures.as_completed(future_to_m):
m = future_to_m[future] # noqa F841
resp = future.result()
if resp and resp.status_code == 200:
try:
result = resp.json()
except Exception as e: # noqa F841
results.append('')
else:
if (result.get('choices')
and result['choices'][0].get('message')
and result['choices'][0]['message'].get(
'content')):
results.append(
result['choices'][0]['message']['content'])
else:
results.append('')
self.flush()
return results
def _generate(
self,
sess,
input: Union[str, PromptList],
max_out_len: int,
) -> str:
"""Generate results given an input.
Args:
inputs (str or PromptList): A string or PromptDict.
The PromptDict should be organized in OpenCompass' API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
for item in input:
content = item['prompt']
if not content:
continue
message = {'content': content}
if item['role'] == 'HUMAN':
message['role'] = 'user'
elif item['role'] == 'BOT':
message['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
message['role'] = 'system'
else:
message['role'] = item['role']
messages.append(message)
request = {
'model':
self._model,
'messages':
messages,
'max_seq_len':
max(
max_out_len if max_out_len else 4096,
self.max_seq_len if self.max_seq_len else 4096,
),
}
request.update(self.generation_kwargs)
try:
retry_num = 0
while retry_num < self.retry:
response = self._infer_result(request, sess)
if response.status_code == 200:
break # success
elif response.status_code == 426:
retry_num += 1 # retry
else:
raise ValueError(f'Status code = {response.status_code}')
else:
raise ValueError(
f'Exceed the maximal retry times. Last status code '
f'= {response.status_code}')
except Exception as e:
self.logger.error(f'Fail to inference request={request}; '
f'model_name={self.path}; error={e}, '
f'stack:{traceback.format_exc()}')
raise e
return response
# @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms
def _infer_result(self, request, sess):
response = sess.request(
'POST',
self._url,
json=request,
headers=self._headers,
timeout=500,
)
return response

View File

@ -79,6 +79,50 @@ class LegacyInternTrainManager(InternTrainManager):
@MODELS.register_module()
class InternTrain(BaseModel):
"""Model wrapper for InternTrain.
Args:
path (str): The name or path to HuggingFace's model.
module_path (str): Path of InternTrain repository.
max_seq_len (int): The maximum length of the input sequence. Defaults
to 2048.
tokenizer_only (bool): If True, only the tokenizer will be initialized.
Defaults to False.
tokenizer_path (str): The path to the tokenizer. Defaults to None.
tokenizer_type: InternTrain's tokenizer type. Defaults to 'InternLM'.
model_config (str, dict, optional): Config of model. There are several
options for this parameter:
- filename (str): The config items are defined in a python file
so the model will load configs from this file.
- config (dict): The configuration items are defined in a dict
and the model will be initialized from ```model_config```.
- None: The config is loaded from ```path```. In this case,
please make sure that ```path``` contains a config file named
``model_config.pt``.
Defaults to None.
model_type: Type of model. Defaults to 'InternTrain'
ckpt_type: The type of load function in InternTrain when checkpoints
are loaded. Defaults to None, which means load the checkpoint
directly with pipeline merged.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
model_dtype: The model's dtype. If None, will use dtype defined in
```model_config```. Defaults to None.
generation_kwargs (Dict, optional): The generation kwargs for the
model. Defaults to dict().
sync_rank (bool): Whether to sync inputs between ranks. Do not use this
if you are not familiar with this behavior. Check `sync_inputs`
function for more details. Defaults to False.
mode (str, optional): The method of input truncation when input length
exceeds max_seq_len. 'mid' means the middle part of the input will be
truncated. Defaults to 'none'.
end_str (str, optional): Whether to trim generated strings with end_str
if the model has special ending strings that are not handled well.
Defaults to None.
"""
def __init__(self,
path: str,
@ -87,14 +131,15 @@ class InternTrain(BaseModel):
tokenizer_only: bool = False,
tokenizer_path: Optional[str] = None,
tokenizer_type: str = 'INTERNLM',
model_config: Optional[str] = None,
model_config: Optional[Union[str, Dict]] = None,
model_type: str = 'INTERNLM2',
ckpt_type: Optional[str] = None,
meta_template: Optional[Dict] = None,
model_dtype: Optional[str] = None,
generation_kwargs={},
sync_rank: bool = False,
mode='none'):
mode='none',
end_str: Optional[str] = None):
super().__init__(path=path,
max_seq_len=max_seq_len,
tokenizer_only=tokenizer_only,
@ -146,6 +191,7 @@ class InternTrain(BaseModel):
bos_token_id=self.tokenizer.bos_id,
pad_token_id=self.tokenizer.bos_id,
eos_token_id=eos_token_ids)
self.end_str = end_str
def _load_model(self,
path: str,
@ -242,7 +288,7 @@ class InternTrain(BaseModel):
else:
raise NotImplementedError(f'Unknown model dtype {model_dtype}')
def get_token_len(self, prompt: str) -> int:
def get_token_len(self, prompt: str, use_bos=None, use_eos=None) -> int:
"""Get lengths of the tokenized strings.
Args:
@ -251,7 +297,7 @@ class InternTrain(BaseModel):
Returns:
int: Length of the input tokens
"""
tokens = self.tokenizer(prompt, use_bos=True, use_eos=True)
tokens = self.tokenizer(prompt, use_bos=use_bos, use_eos=use_eos)
return len(tokens)
def generate(self,
@ -287,8 +333,10 @@ class InternTrain(BaseModel):
max_length=tokens.shape[1] + max_out_len,
**self.generation_kwargs) # bsz, num_return_sequences, max_length
outputs = outputs[:, 0, tokens.shape[1]:]
output_text = self.batch_decode(outputs,
stopping_criteria=stopping_criteria)
output_text = self.batch_decode(
outputs,
eos_token_ids=self.generator.eos_token_id,
stopping_criteria=stopping_criteria)
return output_text
@ -343,7 +391,7 @@ class InternTrain(BaseModel):
for input_text, cont in zip(input_texts, conts)
]
replaced_lens = [
len(self.encode(input_text)[0]) for input_text in replaced_texts
self.get_token_len(input_text) for input_text in replaced_texts
]
loglikelihoods = []
for nloss, nlen, rlen in zip(loss, lens, replaced_lens):
@ -407,11 +455,22 @@ class InternTrain(BaseModel):
return torch.LongTensor(tokens).cuda()
def batch_decode(self, outputs, stopping_criteria: List[str] = []):
def batch_decode(self,
outputs,
eos_token_ids: List[int],
stopping_criteria: List[str] = []):
# outputs: bsz, seq_len
output_text = []
outputs = outputs.tolist()
for output in outputs:
text = self.tokenizer.decode(output.tolist())
# cut off by eos_token_ids
eos_idx = len(output)
for eos_id in eos_token_ids:
if eos_id in output:
eos_idx = min(output.index(eos_id), eos_idx)
text = self.tokenizer.decode(output[:eos_idx])
if self.end_str is not None:
text = text.split(self.end_str)[0]
for stop_word in stopping_criteria:
text = text.split(stop_word)[0]
output_text.append(text)

View File

@ -1,188 +0,0 @@
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.models.base import BaseModel
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
bstr = bstr.replace(invalid_char, b'')
ret = bstr.decode(encoding=coding, errors='ignore')
return ret
class LmdeployPytorchModel(BaseModel):
"""Model wrapper for lmdeploy pytorch engine through python API.
Args:
path (str): path of the supported pytorch model.
max_seq_len (int): The maximum allowed sequence length of a model.
Note that the length of prompt + generated tokens shall not exceed
this value. Defaults to 2048.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
engine_config (Dict, optional): The engine config to set
arguments like session_len, max_batch_size for TurboMind.
gen_config (Dict, optional): Generation config to set
arguments like top_k, top_p, temperature.
end_str (str, optional): Whether to trim generated strings with end_str
if the model has special ending strings that are not handled well.
Defaults to None.
"""
def __init__(self,
path: str,
concurrency: int = 8,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
engine_config: Optional[Dict] = None,
gen_config: Optional[Dict] = None,
end_str: Optional[str] = None):
super().__init__(path=path,
max_seq_len=max_seq_len,
meta_template=meta_template)
from lmdeploy.pytorch import engine as tm
from lmdeploy.version import version_info
if engine_config is not None:
from lmdeploy.messages import PytorchEngineConfig
engine_config = PytorchEngineConfig(**engine_config)
# set thread_safe
if hasattr(engine_config, 'thread_safe'):
engine_config.thread_safe = True
if gen_config is not None:
from lmdeploy.messages import GenerationConfig
gen_config = GenerationConfig(**gen_config)
self.logger = get_logger()
tm_model = tm.Engine(path, engine_config)
self.tokenizer = tm_model.tokenizer
self.generators = [
tm_model.create_instance() for i in range(concurrency)
]
self.generator_ids = [i + 1 for i in range(concurrency)]
from transformers import GenerationConfig
try:
generation_config = GenerationConfig.from_pretrained(path)
except Exception:
generation_config = None
if generation_config and hasattr(generation_config, 'eos_token_id'):
if gen_config.stop_words is None:
stop_words = []
if isinstance(generation_config.eos_token_id, int):
stop_words.append(generation_config.eos_token_id)
else:
assert isinstance(generation_config.eos_token_id, list)
for token_id in generation_config.eos_token_id:
stop_words.append(token_id)
gen_config.stop_words = stop_words
if version_info >= (0, 6, 0):
gen_config.stop_token_ids = stop_words
self.gen_config = gen_config
self.end_str = end_str
self.major_version, self.minor_version = version_info[:2]
def generate(
self,
inputs: List[str],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str]): A list of prompts
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
assert isinstance(
inputs, List), f'List(str) is expected, but got {type(inputs)}'
# split inputs into batches
batch_size = len(self.generators)
batch_inputs = [
inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)
]
results = []
for batch_input in batch_inputs:
with ThreadPoolExecutor() as executor:
_results = list(
executor.map(
self._generate,
self.generators[:len(batch_input)],
self.generator_ids[:len(batch_input)],
batch_input,
[self.gen_config] * len(batch_input),
[self.end_str] * len(batch_input),
))
results += _results
return results
def get_token_len(self, prompt: str) -> int:
input_ids = self.tokenizer.encode(prompt)
return len(input_ids)
def wait(self):
"""Wait till the next query can be sent.
Applicable in both single-thread and multi-thread environments.
"""
return self.token_bucket.get_token()
def _generate(self,
generator,
session_id,
prompt: PromptType,
gen_config=None,
end_str: Optional[str] = None) -> str:
"""Generate results given a list of inputs.
Args:
prompt (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
gen_config (GenerationConfig, optional): Generation
config to set arguments like top_k, top_p, temperature.
end_str (str, optional): Whether to trim generated strings
with end_str if the model has special ending strings
that are not handled well.
Defaults to None.
Returns:
str: The generated string.
"""
assert type(
prompt) is str, 'We only support string for TurboMind Python API'
input_ids = self.tokenizer.encode(prompt)
if self.major_version >= 0 and self.minor_version >= 4:
outputs = generator.infer(session_id,
input_ids,
gen_config=gen_config)
output_ids = outputs.token_ids
else:
_, output_ids, _ = generator.infer(session_id,
input_ids,
gen_config=gen_config)
# stop engine
if hasattr(generator, 'end'):
generator.end(session_id)
# decode output
response_all = self.tokenizer.decode(output_ids)
# trim output
if end_str:
response_all = response_all.split(end_str)[0]
# remove invalid characters
response_all = valid_str(response_all)
return response_all

View File

@ -1,200 +0,0 @@
import threading
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from queue import Queue
from typing import Dict, List, Optional, Union
import numpy as np
from opencompass.models.base import BaseModel, LMTemplateParser
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
bstr = bstr.replace(invalid_char, b'')
ret = bstr.decode(encoding=coding, errors='ignore')
return ret
def prepare_tensor(name, input_tensor):
"""Create grpcclient's InferInput instance according to a given tensor."""
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype
t = grpcclient.InferInput(name, list(input_tensor.shape),
np_to_triton_dtype(input_tensor.dtype))
t.set_data_from_numpy(input_tensor)
return t
def stream_callback(que, result, error):
"""callback function invoked by triton client."""
que.put((result, error))
class LmdeployTisModel(BaseModel):
"""Model wrapper for LMDeploy Python Backend Triton Inference Server gRPC
API.
Args:
path (str): The name of OpenAI's model.
tis_addr (str): The address (ip:port format) of turbomind's
triton inference server
max_seq_len (int): The maximum allowed sequence length of a model.
Note that the length of prompt + generated tokens shall not exceed
this value. Defaults to 2048.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
"""
is_api: bool = True
def __init__(self,
path: str,
tis_addr: str = '0.0.0.0:33337',
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
end_str: Optional[str] = None):
super().__init__(path=path,
max_seq_len=max_seq_len,
meta_template=meta_template)
from lmdeploy.tokenizer import Tokenizer
self.logger = get_logger()
self.template_parser = LMTemplateParser(meta_template)
self.eos_token_id = None
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
self.tis_addr = tis_addr
self.tokenizer = Tokenizer(path)
self.end_str = end_str
def generate(
self,
inputs: List[str or PromptList],
max_out_len: int = 512,
temperature: float = 1.0,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str or PromptList]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic. Defaults to 1.0.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs),
[self.end_str] * len(inputs)))
return results
def wait(self):
"""Wait till the next query can be sent.
Applicable in both single-thread and multi-thread environments.
"""
return self.token_bucket.get_token()
def get_token_len(self, prompt: str) -> int:
input_ids = self.tokenizer.encode(prompt)
return len(input_ids)
def _call_triton_server(self, prompt, tis_addr, session_id,
request_output_len, temperature, res_que):
import tritonclient.grpc as grpcclient
with grpcclient.InferenceServerClient(tis_addr) as client:
inputs = [
prepare_tensor('prompt',
np.array([prompt.encode()], dtype=np.object_)),
prepare_tensor('max_tokens',
np.array([request_output_len], dtype=np.int32)),
prepare_tensor('temperature',
np.array([temperature], dtype=np.float_)),
prepare_tensor('top_p', np.array([1.0], dtype=np.float_)),
prepare_tensor('top_k', np.array([1], dtype=np.int32)),
prepare_tensor('ignore_eos', np.array([False],
dtype=np.bool_)),
prepare_tensor('stream', np.array([True], dtype=np.bool_)),
]
# async_stream
client.start_stream(partial(stream_callback, res_que))
client.async_stream_infer('lmdeploy_model',
inputs,
sequence_id=session_id,
sequence_start=True,
sequence_end=True)
res_que.put(None)
return
def _process_result(self, que):
text = ''
while True:
res = que.get()
if res is not None:
result, err = res
if err is not None:
print(err)
else:
res = result.as_numpy('response').item().decode()
text += res
else:
return text
def _generate(self,
prompt: str or PromptList,
max_out_len: int,
temperature: float,
end_str: Optional[str] = None) -> str:
"""Generate results given a list of inputs.
Args:
prompt (str or PromptList): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic.
Returns:
str: The generated string.
"""
assert type(
prompt
) is str, 'We only support string for LMDeploy Python Backend TIS API'
res_que = Queue()
self._call_triton_server(prompt=prompt,
tis_addr=self.tis_addr,
session_id=threading.currentThread().ident,
request_output_len=max_out_len,
temperature=temperature,
res_que=res_que)
text = self._process_result(res_que)
response = valid_str(text)
if end_str:
response = response.split(end_str)[0]
return response

View File

@ -601,6 +601,10 @@ class OpenAISDK(OpenAI):
if self.verbose:
self.logger.info(
'Successfully get response from OpenAI API')
try:
self.logger.info(responses)
except Exception as e: # noqa F841
pass
return responses.choices[0].message.content
except Exception as e:
self.logger.error(e)

View File

@ -1,135 +0,0 @@
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.models.base import BaseModel, LMTemplateParser
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
bstr = bstr.replace(invalid_char, b'')
ret = bstr.decode(encoding=coding, errors='ignore')
return ret
class TurboMindTisModel(BaseModel):
"""Model wrapper for TurboMind Triton Inference Server gRPC API.
Args:
path (str): The name of OpenAI's model.
tis_addr (str): The address (ip:port format) of turbomind's
triton inference server
max_seq_len (int): The maximum allowed sequence length of a model.
Note that the length of prompt + generated tokens shall not exceed
this value. Defaults to 2048.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
"""
is_api: bool = True
def __init__(
self,
path: str,
tis_addr: str = '0.0.0.0:33337',
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
):
super().__init__(path=path,
max_seq_len=max_seq_len,
meta_template=meta_template)
from lmdeploy.serve.turbomind.utils import Preprocessor
self.preprocess = Preprocessor(tis_addr)
self.logger = get_logger()
self.template_parser = LMTemplateParser(meta_template)
self.eos_token_id = None
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
self.tis_addr = tis_addr
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
temperature: float = 1.0,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[PromptType]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic. Defaults to 1.0.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)))
return results
def get_token_len(self, prompt: str) -> int:
input_ids, _ = self.preprocess(prompt)
return input_ids.shape[-1]
def wait(self):
"""Wait till the next query can be sent.
Applicable in both single-thread and multi-thread environments.
"""
return self.token_bucket.get_token()
def _generate(self, prompt: PromptType, max_out_len: int,
temperature: float) -> str:
"""Generate results given a list of inputs.
Args:
prompt (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic.
Returns:
str: The generated string.
"""
assert type(
prompt) is str, 'We only support string for TurboMind RPC API'
from lmdeploy.serve.turbomind.chatbot import Chatbot
chatbot = Chatbot(self.tis_addr,
temperature=temperature,
capability='completion',
top_k=1,
log_level=logging.ERROR)
for status, text, n_token in chatbot.stream_infer(
session_id=threading.currentThread().ident,
prompt=prompt,
request_output_len=max_out_len,
sequence_start=True,
sequence_end=True):
continue
response = valid_str(text)
response = response.replace('<eoa>', '')
return response

View File

@ -1,7 +1,6 @@
# flake8: noqa
# yapf: disable
import copy
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.models.base import BaseModel
@ -31,38 +30,32 @@ class TurboMindModelwithChatTemplate(BaseModel):
self,
path: str,
tokenizer_only: bool = False,
backend: str = 'turbomind',
engine_config: Dict = {},
gen_config: Dict = {},
concurrency: int = 8,
max_seq_len: int = None,
meta_template: Optional[Dict] = None,
fastchat_template: Optional[str] = None,
stop_words: List[str] = [],
):
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.turbomind import TurboMind
from lmdeploy.version import version_info
from transformers import AutoTokenizer
self.logger = get_logger()
self.path = path
self.tokenizer_only = tokenizer_only
self.template_parser = _get_meta_template(meta_template)
self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
self.origin_tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
from lmdeploy import version_info
from transformers import AutoTokenizer
self.version_info = version_info
self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
if not tokenizer_only:
DEFAULT_ENGING_CONFIG = {'session_len': self.max_seq_len}
_engine_config = DEFAULT_ENGING_CONFIG.copy()
_engine_config.update(engine_config)
engine_config = TurbomindEngineConfig(**_engine_config)
tm_model = TurboMind.from_pretrained(path, engine_config=engine_config)
self.tokenizer = tm_model.tokenizer
self.generators = [tm_model.create_instance() for i in range(concurrency)]
self.generator_ids = [i + 1 for i in range(concurrency)]
self.concurrency = concurrency
self.pipe = self._build_pipe(path, backend, _engine_config)
else:
self.pipe = None
self.gen_config = gen_config
self.version_info = version_info
self.fastchat_template = fastchat_template
self.stop_words = list(set(stop_words + self._get_potential_stop_words(path)))
self.logger.info(f'using stop words: {self.stop_words}')
@ -76,23 +69,23 @@ class TurboMindModelwithChatTemplate(BaseModel):
generation_config = None
if generation_config and hasattr(generation_config, 'eos_token_id'):
if isinstance(generation_config.eos_token_id, int):
potential_stop_words.append(self.origin_tokenizer.decode(generation_config.eos_token_id))
potential_stop_words.append(self.tokenizer.decode(generation_config.eos_token_id))
else:
assert isinstance(generation_config.eos_token_id, list)
for token_id in generation_config.eos_token_id:
potential_stop_words.append(self.origin_tokenizer.decode(token_id))
if self.origin_tokenizer.eos_token is not None:
potential_stop_words.append(self.origin_tokenizer.eos_token)
potential_stop_words.append(self.tokenizer.decode(token_id))
if self.tokenizer.eos_token is not None:
potential_stop_words.append(self.tokenizer.eos_token)
potential_stop_words = list(set(potential_stop_words))
potential_stop_words = [s for s in potential_stop_words if s]
return potential_stop_words
def generate(self,
inputs: List[str],
max_out_len: int = 512,
max_out_len: int,
stopping_criteria: List[str] = [],
do_sample: Optional[bool] = None,
temperature: int = 1,
temperature: float = 1.0,
**kwargs) -> List[str]:
"""Generate results given a list of inputs.
@ -104,93 +97,45 @@ class TurboMindModelwithChatTemplate(BaseModel):
List[str]: A list of generated strings.
"""
assert isinstance(inputs, List), f'List(str) is expected, but got {type(inputs)}'
messages = _convert_chat_messages(inputs)
if self.fastchat_template:
messages = _format_with_fast_chat_template(messages, self.fastchat_template)
else:
messages = [self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages]
# split messages into batches
batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)]
messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages]
stop_words = list(set(self.stop_words + stopping_criteria))
encode_stop_words = []
if stop_words is not None and len(stop_words) > 0:
for words in stop_words:
encode_stop_words += self.tokenizer.encode(words, add_bos=False)
DEFAULT_GEN_CONFIG = {
'max_new_tokens': max_out_len,
'min_new_tokens': 1,
'top_k': 1,
'stop_words': encode_stop_words,
'stop_words': stop_words,
}
gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
gen_config.update(self.gen_config)
if do_sample:
gen_config['top_k'] = 1000
gen_config['top_k'] = 40
gen_config['temperature'] = temperature
else:
if self.version_info >= (0, 6, 0):
gen_config['do_sample'] = False
else:
gen_config['top_k'] = 1
from lmdeploy.messages import GenerationConfig
from lmdeploy import GenerationConfig
gen_config = {k: v for k, v in gen_config.items() if hasattr(GenerationConfig, k)}
gen_config = GenerationConfig(**gen_config)
if self.version_info >= (0, 6, 0):
gen_config.stop_words = stop_words
gen_config.convert_stop_bad_words_to_ids(self.tokenizer)
results = []
for batch_message in batch_messages:
n = len(batch_message)
with ThreadPoolExecutor() as executor:
_results = list(
executor.map(
self._generate,
self.generators[:n],
self.generator_ids[:n],
batch_message,
[gen_config] * n,
))
results += _results
outputs = self.pipe(messages, gen_config=gen_config, do_preprocess=False)
for output in outputs:
text = self.tokenizer.decode(output.token_ids)
results.append(text)
for s in stop_words:
results = [r.split(s)[0] for r in results]
return results
def _generate(self,
generator,
session_id,
prompt: PromptType,
gen_config=None) -> str:
"""Generate results given a list of inputs.
Args:
prompt (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
gen_config (GenerationConfig, optional): Generation
config to set arguments like top_k, top_p, temperature.
Returns:
str: The generated string.
"""
assert type(prompt) is str, 'We only support string for TurboMind Python API'
input_ids = self.tokenizer.encode(prompt, add_bos=False)
for outputs in generator.stream_infer(session_id=session_id,
input_ids=[input_ids],
gen_config=gen_config,
sequence_start=True,
sequence_end=True,
step=0,
stream_output=False):
if self.version_info >= (0, 4, 0):
output_ids = outputs.token_ids
else:
_, output_ids, _ = outputs
response = self.tokenizer.decode(output_ids)
response = valid_str(response)
return response
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
@ -201,5 +146,20 @@ class TurboMindModelwithChatTemplate(BaseModel):
int: Length of the input tokens
"""
m = _convert_chat_messages([prompt])[0]
t = self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True)
t = self.tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True)
return len(t['input_ids'])
def _build_pipe(self, model_path, backend, engine_config):
from lmdeploy import (PytorchEngineConfig, TurbomindEngineConfig,
pipeline)
assert backend in ['pytorch', 'turbomind'], \
f'unsupported backend type: {backend}'
if backend == 'turbomind':
filtered = {k: v for k, v in engine_config.items() if hasattr(TurbomindEngineConfig, k)}
backend_config = TurbomindEngineConfig(**filtered)
else:
filtered = {k: v for k, v in engine_config.items() if hasattr(PytorchEngineConfig, k)}
backend_config = PytorchEngineConfig(**filtered)
return pipeline(model_path, backend_config=backend_config, log_level='INFO', max_log_len=10)
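Taken together, the refactor drops the per-instance TurboMind generators in favour of a single lmdeploy pipeline. A minimal sketch of the new generation path, assuming lmdeploy >= 0.6; the model path, prompt and stop word are placeholders:

from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
from transformers import AutoTokenizer

model_path = 'internlm/internlm2_5-7b-chat'  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
pipe = pipeline(model_path, backend_config=TurbomindEngineConfig(session_len=4096))

# the chat template is applied manually, so preprocessing is disabled below
prompt = tokenizer.apply_chat_template(
    [{'role': 'user', 'content': 'Hello!'}],
    add_generation_prompt=True, tokenize=False)

gen_config = GenerationConfig(max_new_tokens=128, top_k=1)
gen_config.stop_words = ['<|im_end|>']  # placeholder stop word
gen_config.convert_stop_bad_words_to_ids(tokenizer)

outputs = pipe([prompt], gen_config=gen_config, do_preprocess=False)
print(tokenizer.decode(outputs[0].token_ids))

Decoding token_ids with the HF tokenizer and trimming at the stop words mirrors what the rewritten generate method does above.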

View File

@ -232,6 +232,8 @@ class DLCRunner(BaseRunner):
while True:
# 1. Avoid requesting DLC too frequently.
# 2. A DLC job may not be ready immediately after creation.
dlc_sleep_time = self.aliyun_cfg.get('dlc_sleep_time', 10)
time.sleep(dlc_sleep_time)
num_retry = 60
for retry_index in range(num_retry):
time.sleep(2)
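The pause before polling a freshly created job is now read from the runner config rather than hard-coded; a hedged sketch of setting it (the remaining aliyun_cfg keys are omitted):

# Only the new key is shown; a real DLCRunner config carries many more fields.
aliyun_cfg = dict(
    dlc_sleep_time=30,  # seconds to wait before the first status poll; defaults to 10
)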

View File

@ -4,6 +4,7 @@ from .all_obj import AllObjSummarizer
from .alpacaeval import AlpacaSummarizer
from .arenahard import ArenaHardSummarizer
from .charm import CharmMemSummarizer
from .common_summarizer import CommonSummarizer
from .compass_arena import CompassArenaSummarizer
from .compassbench import CompassBenchSummarizer
from .corev2 import Corev2Summarizer

View File

@ -0,0 +1,146 @@
# flake8: noqa
# yapf: disable
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime
import numpy as np
from mmengine import ConfigDict
from tabulate import tabulate
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
from .compass_arena import CompassArenaSummarizer
from .utils import get_judgeanswer_and_reference, get_outdir
def model_abbr_from_cfg_used_in_summarizer(model):
if model.get('summarizer_abbr', None):
return model['summarizer_abbr']
else:
return model_abbr_from_cfg(model)
def post_process_single_rate(judgement: str):
"""Input a string like below:
xxx[[5]]xxx, and extract the score
"""
pattern = r'Rating:\s*\[\[([\d.]+)\]\]'
matched_result = re.findall(pattern, judgement)
if matched_result:
score = float(matched_result[0])
else:
return None
return {'score': score}
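For illustration, the extraction above behaves as follows on a made-up judgement string (assuming post_process_single_rate is in scope):

judgement = 'The answer is mostly correct but misses one step. Rating: [[7.5]]'
print(post_process_single_rate(judgement))    # -> {'score': 7.5}
print(post_process_single_rate('no rating'))  # -> None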
def get_capability_results(
judged_answers,
references,
fout,
fout_flag,
model_abbr,
judge_model_abbr,
dataset_abbr,
):
capability_ratings = defaultdict(int)
capability_counts = defaultdict(int)
for ans, ref in zip(judged_answers, references):
capability_ratings['total'] += ans['score']
capability_counts['total'] += 1
capability_ratings[ref['capability']] += ans['score']
capability_counts[ref['capability']] += 1
capability_avg_ratings = defaultdict(float)
for capability, total_score in capability_ratings.items():
s = total_score / capability_counts[capability]
s = round(s, 2)
capability_avg_ratings[capability] = s
columns = list(capability_avg_ratings.keys())
columns.insert(0, columns.pop(columns.index('total')))
if fout_flag == 0:
with open(fout, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['model', 'judge_model', 'dataset'] + columns)
writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns])
else:
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns])
class CommonSummarizer(CompassArenaSummarizer):
"""Do the subjectivity analyze based on evaluation results.
Args:
config (ConfigDict): The configuration object of the evaluation task.
It's expected to be filled out at runtime.
"""
def __init__(self, config: ConfigDict, judge_type='single_rate') -> None:
self.judge_type = judge_type
self.tasks = []
self.cfg = config
self.judge_type = 'single_rate'
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
self.judge_model_cfgs = self.cfg['judge_models']
self.judge_map = {
'single_rate': post_process_single_rate
}
self.judge_function = self.judge_map[self.judge_type]
def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results.
Args:
time_str (str): Timestamp for file naming.
Returns:
pd.DataFrame: The summary results.
"""
if self.judge_type == 'pair':
return super().summarize()
# self.judge_type == 'single_rate'
dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag = 0
output_tmp_file = osp.join(output_dir, 'result.csv')
output_file = osp.join(output_dir, 'total_result.csv')
for eval_model_cfg in self.eval_model_cfgs:
for judge_model_cfg in self.judge_model_cfgs:
eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
show_judge_model_abbr = model_abbr_from_cfg_used_in_summarizer(judge_model_cfg)
judge_abbr = model_abbr_from_cfg(judge_model_cfg)
subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + judge_abbr)
if os.path.isdir(subdir_path):
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
show_dataset_abbr = dataset_abbr_from_cfg(dataset)
get_capability_results(judged_answers, references, output_tmp_file, fout_flag, show_model_abbr, show_judge_model_abbr, show_dataset_abbr)
fout_flag += 1
else:
print(subdir_path + ' does not exist! Please check!')
with open(output_tmp_file, 'r') as f:
csv_reader = csv.reader(f)
header = next(csv_reader)
table = [line for line in csv_reader]
new_header = [''] + [line[0] for line in table]
new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
t = tabulate(new_table, headers=new_header)
with open(output_file, 'a') as f:
f.write(','.join(new_header) + '\n')
for line in new_table:
f.write(','.join(map(str, line)) + '\n')
print(t)
print(output_file)
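The closing block pivots result.csv, which holds one row per (model, judge, dataset), into a table whose columns are models. A small worked example of that transpose with invented rows:

# header/table mimic what csv.reader yields from result.csv; the values are invented.
header = ['model', 'judge_model', 'dataset', 'total', 'reasoning']
table = [['model-a', 'judge-x', 'demo-set', '7.1', '6.8'],
         ['model-b', 'judge-x', 'demo-set', '6.4', '6.9']]

new_header = [''] + [line[0] for line in table]
new_table = [[h] + [line[i] for line in table]
             for i, h in enumerate(header[1:], start=1)]
# new_header -> ['', 'model-a', 'model-b']
# new_table  -> [['judge_model', 'judge-x', 'judge-x'],
#                ['dataset', 'demo-set', 'demo-set'],
#                ['total', '7.1', '6.4'],
#                ['reasoning', '6.8', '6.9']]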

View File

@ -9,7 +9,7 @@ from mmengine.config import Config
from opencompass.datasets.custom import make_custom_dataset_config
from opencompass.models import (VLLM, HuggingFace, HuggingFaceBaseModel,
HuggingFaceCausalLM, HuggingFaceChatGLM3,
HuggingFacewithChatTemplate, TurboMindModel,
HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate,
VLLMwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
@ -233,7 +233,7 @@ def change_accelerator(models, accelerator):
model_accels = []
for model in models:
logger.info(f'Transforming {model["abbr"]} to {accelerator}')
# change HuggingFace model to VLLM or TurboMindModel
# change HuggingFace model to VLLM or LMDeploy
if model['type'] in [HuggingFace, HuggingFaceCausalLM, HuggingFaceChatGLM3, f'{HuggingFaceBaseModel.__module__}.{HuggingFaceBaseModel.__name__}']:
gen_args = dict()
if model.get('generation_kwargs') is not None:
@ -254,10 +254,10 @@ def change_accelerator(models, accelerator):
if accelerator == 'lmdeploy':
logger.info(f'Transforming {model["abbr"]} to {accelerator}')
mod = TurboMindModel
mod = TurboMindModelwithChatTemplate
acc_model = dict(
type=f'{mod.__module__}.{mod.__name__}',
abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
path=model['path'],
engine_config=dict(session_len=model['max_seq_len'],
max_batch_size=model['batch_size'],
@ -270,7 +270,6 @@ def change_accelerator(models, accelerator):
max_out_len=model['max_out_len'],
max_seq_len=model['max_seq_len'],
batch_size=model['batch_size'],
concurrency=model['batch_size'],
run_cfg=model['run_cfg'],
)
for item in ['meta_template']:
@ -312,7 +311,7 @@ def change_accelerator(models, accelerator):
mod = TurboMindModelwithChatTemplate
acc_model = dict(
type=f'{mod.__module__}.{mod.__name__}',
abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
path=model['path'],
engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
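For reference, a hedged sketch of the model dict this branch emits for the lmdeploy accelerator; the type, abbr, path and numeric values are illustrative rather than taken from a real run:

acc_model = dict(
    type='opencompass.models.TurboMindModelwithChatTemplate',  # illustrative dotted path
    abbr='internlm2_5-7b-chat-lmdeploy',
    path='internlm/internlm2_5-7b-chat',
    engine_config=dict(max_batch_size=16, tp=1),
    gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=16,
    run_cfg=dict(num_gpus=1),
)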

View File

@ -1,6 +1,7 @@
# Alpaca-eval
alpaca-eval==0.6
cn2an
dingo-python
# Icl topk retriever
faiss_gpu==1.7.2
# Humaneval, Humaneval X

View File

@ -23,6 +23,7 @@ python-Levenshtein
rank_bm25==0.2.2
rapidfuzz
requests>=2.31.0
retrying
rich
rouge
-e git+https://github.com/Isaac-JL-Chen/rouge_chinese.git@master#egg=rouge_chinese