mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
Merge branch 'open-compass:main' into main
This commit is contained in:
commit
346c06015a
26
.github/scripts/eval_regression_base.py
vendored
26
.github/scripts/eval_regression_base.py
vendored
@ -8,15 +8,17 @@ with read_base():
|
||||
race_datasets # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
|
||||
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
|
||||
models as hf_deepseek_v2_lite_model # noqa: F401, E501
|
||||
# read hf models - chat models
|
||||
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
|
||||
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
|
||||
models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_2b import \
|
||||
models as hf_gemma_2b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_7b import \
|
||||
models as hf_gemma_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_2b import \
|
||||
models as hf_gemma2_2b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_9b import \
|
||||
models as hf_gemma2_9b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
|
||||
models as hf_internlm2_5_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
|
||||
@ -31,16 +33,28 @@ with read_base():
|
||||
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
|
||||
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
|
||||
models as hf_llama2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_8b import \
|
||||
models as hf_llama3_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
|
||||
models as lmdeploy_llama3_1_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
|
||||
models as lmdeploy_llama3_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
|
||||
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
|
||||
models as hf_mistral_7b_v0_3_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
|
||||
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \
|
||||
models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
|
||||
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
|
||||
models as hf_qwen2_0_5b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
|
||||
models as hf_qwen2_1_5b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_7b import \
|
||||
models as hf_qwen2_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
|
||||
models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
|
||||
|
34
.github/scripts/eval_regression_chat.py
vendored
34
.github/scripts/eval_regression_chat.py
vendored
@ -13,20 +13,32 @@ with read_base():
|
||||
models as hf_baichuan2_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
|
||||
models as hf_glm4_9b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
|
||||
models as lmdeploy_glm4_9b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.chatglm.vllm_glm4_9b_chat import \
|
||||
models as vllm_glm4_9b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
|
||||
models as hf_deepseek_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
|
||||
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
|
||||
models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
|
||||
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
|
||||
models as hf_gemma_2b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
|
||||
models as hf_gemma_7b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
|
||||
models as hf_gemma2_2b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
|
||||
models as hf_gemma2_9b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
|
||||
models as vllm_gemma_7b_it_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
|
||||
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
|
||||
models as hf_internlm2_5_20b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
|
||||
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
|
||||
models as lmdeploy_internlm2_5_20b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
|
||||
models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
|
||||
@ -37,14 +49,20 @@ with read_base():
|
||||
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
|
||||
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
|
||||
models as hf_llama3_1_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
|
||||
models as hf_llama3_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
|
||||
models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
|
||||
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
|
||||
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
|
||||
models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
|
||||
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
|
||||
from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \
|
||||
models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501
|
||||
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
|
||||
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
|
||||
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
|
||||
@ -57,6 +75,10 @@ with read_base():
|
||||
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
|
||||
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
|
||||
models as hf_qwen2_1_5b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \
|
||||
models as hf_qwen2_7b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
|
||||
models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
|
||||
|
40
.github/scripts/oc_score_assert.py
vendored
40
.github/scripts/oc_score_assert.py
vendored
@ -8,29 +8,33 @@ output_path = 'regression_result_daily'
|
||||
|
||||
chat_model_list = [
|
||||
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
|
||||
'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf',
|
||||
'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind',
|
||||
'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind',
|
||||
'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind',
|
||||
'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf',
|
||||
'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
|
||||
'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf',
|
||||
'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
|
||||
'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
|
||||
'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf',
|
||||
'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf',
|
||||
'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind',
|
||||
'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
|
||||
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
|
||||
'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm',
|
||||
'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf',
|
||||
'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind',
|
||||
'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm',
|
||||
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
|
||||
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
|
||||
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf',
|
||||
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
|
||||
'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf',
|
||||
'lmdeploy-api-test'
|
||||
]
|
||||
base_model_list = [
|
||||
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind',
|
||||
'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf',
|
||||
'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf',
|
||||
'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind',
|
||||
'internlm2-7b-turbomind', 'internlm2-base-7b-hf',
|
||||
'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
|
||||
'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf',
|
||||
'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
|
||||
'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
|
||||
'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf',
|
||||
'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf',
|
||||
'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf',
|
||||
'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind',
|
||||
'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind',
|
||||
'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf',
|
||||
'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf',
|
||||
'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
|
||||
'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind',
|
||||
'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
|
||||
]
|
||||
dataset_list = ['gsm8k', 'race-middle', 'race-high']
|
||||
|
||||
|
114
.github/scripts/oc_score_baseline.yaml
vendored
114
.github/scripts/oc_score_baseline.yaml
vendored
@ -8,6 +8,16 @@ glm-4-9b-chat-hf:
|
||||
race-middle: 88
|
||||
race-high: 88
|
||||
|
||||
glm-4-9b-chat-turbomind:
|
||||
gsm8k: 69
|
||||
race-middle: 82
|
||||
race-high: 77
|
||||
|
||||
glm-4-9b-chat-vllm:
|
||||
gsm8k: 73
|
||||
race-middle: 87
|
||||
race-high: 87
|
||||
|
||||
deepseek-7b-chat-hf:
|
||||
gsm8k: 60
|
||||
race-middle: 74
|
||||
@ -18,6 +28,11 @@ deepseek-moe-16b-chat-hf:
|
||||
race-middle: 62
|
||||
race-high: 70
|
||||
|
||||
deepseek-v2-lite-chat-hf:
|
||||
gsm8k: 59
|
||||
race-middle: 82
|
||||
race-high: 79
|
||||
|
||||
deepseek-7b-chat-vllm:
|
||||
gsm8k: 63
|
||||
race-middle: 74
|
||||
@ -33,23 +48,48 @@ gemma-7b-it-hf:
|
||||
race-middle: 74
|
||||
race-high: 71
|
||||
|
||||
gemma-7b-it-vllm:
|
||||
gsm8k: 38
|
||||
race-middle: 75
|
||||
race-high: 70
|
||||
|
||||
gemma2-2b-it-hf:
|
||||
gsm8k: 62
|
||||
race-middle: 75
|
||||
race-high: 67
|
||||
|
||||
gemma2-9b-it-hf:
|
||||
gsm8k: 80
|
||||
race-middle: 89
|
||||
race-high: 85
|
||||
|
||||
internlm2_5-7b-chat-hf:
|
||||
gsm8k: 86
|
||||
race-middle: 92
|
||||
race-high: 93
|
||||
|
||||
internlm2_5-20b-chat-hf:
|
||||
gsm8k: 91
|
||||
race-middle: 95
|
||||
race-high: 91
|
||||
|
||||
internlm2_5-7b-chat-turbomind:
|
||||
gsm8k: 87
|
||||
race-middle: 92
|
||||
race-high: 93
|
||||
|
||||
internlm2_5-20b-chat-turbomind:
|
||||
gsm8k: 91
|
||||
race-middle: 95
|
||||
race-high: 91
|
||||
|
||||
internlm2-chat-1.8b-turbomind:
|
||||
gsm8k: 40
|
||||
race-middle: 82
|
||||
race-high: 83
|
||||
|
||||
internlm2-chat-1.8b-sft-turbomind:
|
||||
gsm8k: 32
|
||||
gsm8k: 34
|
||||
race-middle: 81
|
||||
race-high: 83
|
||||
|
||||
@ -68,11 +108,21 @@ internlm2-chat-7b-vllm:
|
||||
race-middle: 90
|
||||
race-high: 91
|
||||
|
||||
llama-3_1-8b-instruct-hf:
|
||||
gsm8k: 82
|
||||
race-middle: 82
|
||||
race-high: 88
|
||||
|
||||
llama-3-8b-instruct-hf:
|
||||
gsm8k: 77
|
||||
race-middle: 85
|
||||
race-high: 87
|
||||
|
||||
llama-3_1-8b-instruct-turbomind:
|
||||
gsm8k: 79
|
||||
race-middle: 82
|
||||
race-high: 88
|
||||
|
||||
llama-3-8b-instruct-turbomind:
|
||||
gsm8k: 77
|
||||
race-middle: 85
|
||||
@ -83,6 +133,11 @@ mistral-7b-instruct-v0.2-hf:
|
||||
race-middle: 82
|
||||
race-high: 78
|
||||
|
||||
mistral-7b-instruct-v0.3-hf:
|
||||
gsm8k: 53
|
||||
race-middle: 80
|
||||
race-high: 78
|
||||
|
||||
mistral-7b-instruct-v0.2-vllm:
|
||||
gsm8k: 49
|
||||
race-middle: 81
|
||||
@ -118,6 +173,11 @@ qwen1.5-0.5b-chat-hf:
|
||||
race-middle: 55
|
||||
race-high: 50
|
||||
|
||||
qwen2-1.5b-instruct-hf:
|
||||
gsm8k: 63
|
||||
race-middle: 77
|
||||
race-high: 86
|
||||
|
||||
qwen2-1.5b-instruct-turbomind:
|
||||
gsm8k: 60
|
||||
race-middle: 77
|
||||
@ -128,6 +188,11 @@ qwen2-7b-instruct-turbomind:
|
||||
race-middle: 87
|
||||
race-high: 89
|
||||
|
||||
qwen2-7b-instruct-hf:
|
||||
gsm8k: 85
|
||||
race-middle: 87
|
||||
race-high: 91
|
||||
|
||||
qwen1.5-0.5b-chat-vllm:
|
||||
gsm8k: 5
|
||||
race-middle: 57
|
||||
@ -153,6 +218,11 @@ deepseek-moe-16b-base-hf:
|
||||
race-middle: 35
|
||||
race-high: 23
|
||||
|
||||
deepseek-v2-lite-hf:
|
||||
gsm8k: 37
|
||||
race-middle: 56
|
||||
race-high: 62
|
||||
|
||||
deepseek-7b-base-turbomind:
|
||||
gsm8k: 21
|
||||
race-middle: 42
|
||||
@ -173,8 +243,18 @@ gemma-7b-hf:
|
||||
race-middle: 59
|
||||
race-high: 66
|
||||
|
||||
gemma2-2b-hf:
|
||||
gsm8k: 33
|
||||
race-middle: 56
|
||||
race-high: 58
|
||||
|
||||
gemma2-9b-hf:
|
||||
gsm8k: 70
|
||||
race-middle: 82
|
||||
race-high: 84
|
||||
|
||||
internlm2_5-7b-hf:
|
||||
gsm8k: 46
|
||||
gsm8k: 47
|
||||
race-middle: 92
|
||||
race-high: 91
|
||||
|
||||
@ -208,6 +288,21 @@ internlm2-base-7b-turbomind:
|
||||
race-middle: 75
|
||||
race-high: 81
|
||||
|
||||
llama-2-7b-hf:
|
||||
gsm8k: 17
|
||||
race-middle: 32
|
||||
race-high: 38
|
||||
|
||||
llama-3-8b-hf:
|
||||
gsm8k: 48
|
||||
race-middle: 64
|
||||
race-high: 70
|
||||
|
||||
llama-3.1-8b-turbomind:
|
||||
gsm8k: 57
|
||||
race-middle: 67
|
||||
race-high: 75
|
||||
|
||||
llama-3-8b-turbomind:
|
||||
gsm8k: 52
|
||||
race-middle: 63
|
||||
@ -218,6 +313,11 @@ mistral-7b-v0.2-hf:
|
||||
race-middle: 42
|
||||
race-high: 60
|
||||
|
||||
mistral-7b-v0.3-hf:
|
||||
gsm8k: 43
|
||||
race-middle: 42
|
||||
race-high: 60
|
||||
|
||||
mistral-7b-v0.2-vllm:
|
||||
gsm8k: 45
|
||||
race-middle: 42
|
||||
@ -228,11 +328,21 @@ qwen1.5-moe-a2.7b-hf:
|
||||
race-middle: 78
|
||||
race-high: 90
|
||||
|
||||
qwen2-1.5b-hf:
|
||||
gsm8k: 58
|
||||
race-middle: 65
|
||||
race-high: 78
|
||||
|
||||
qwen2-0.5b-hf:
|
||||
gsm8k: 35
|
||||
race-middle: 52
|
||||
race-high: 48
|
||||
|
||||
qwen2-7b-hf:
|
||||
gsm8k: 82
|
||||
race-middle: 88
|
||||
race-high: 89
|
||||
|
||||
qwen2-1.5b-turbomind:
|
||||
gsm8k: 57
|
||||
race-middle: 64
|
||||
|
75
.github/workflows/daily-run-test.yml
vendored
75
.github/workflows/daily-run-test.yml
vendored
@ -14,9 +14,14 @@ env:
|
||||
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
|
||||
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
|
||||
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
|
||||
DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets
|
||||
HF_DATASETS_OFFLINE: 1
|
||||
HF_EVALUATE_OFFLINE: 1
|
||||
TRANSFORMERS_OFFLINE: 1
|
||||
VLLM_USE_MODELSCOPE: false
|
||||
LMDEPLOY_USE_MODELSCOPE: false
|
||||
HF_HUB_OFFLINE: 1
|
||||
TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
|
||||
|
||||
@ -43,7 +48,11 @@ jobs:
|
||||
|
||||
daily_run_test:
|
||||
needs: build-pypi
|
||||
runs-on: self-hosted
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
cuda_env: [dsw_cu11, dsw_cu12]
|
||||
runs-on: ${{ matrix.cuda_env }}
|
||||
environment: 'prod'
|
||||
timeout-minutes: 420 #7hours
|
||||
steps:
|
||||
@ -53,22 +62,38 @@ jobs:
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: my-artifact-${{ github.run_id }}
|
||||
- name: Prepare - create conda env and install torch
|
||||
- name: Prepare - create conda env and install torch - cu11
|
||||
if: ${{matrix.cuda_env == 'dsw_cu11'}}
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda create -y --name ${{env.CONDA_ENV}} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
pip install opencompass*.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
|
||||
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip uninstall torch torchvision torchaudio -y
|
||||
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
pip list
|
||||
- name: Prepare - create conda env and install torch - cu12
|
||||
if: ${{matrix.cuda_env == 'dsw_cu12'}}
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install lmdeploy==0.6.0 --cache-dir ${{env.PIP_CACHE_PATH}} --no-cache-dir
|
||||
pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
pip uninstall torch torchvision torchaudio -y
|
||||
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
||||
pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}}
|
||||
conda info --envs
|
||||
pip list
|
||||
- name: Prepare - prepare data and hf model
|
||||
run: |
|
||||
ln -s ${{env.DATEASET_CACHE_PATH}} data
|
||||
@ -77,45 +102,45 @@ jobs:
|
||||
- name: Run chat model test
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py
|
||||
python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat/*/summary regression_result_daily
|
||||
opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run base model test
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base/*/summary regression_result_daily
|
||||
opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Run command testcase
|
||||
run: |
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
export from_tf=TRUE
|
||||
python tools/list_configs.py internlm2_5 mmlu
|
||||
python run.py --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1 --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1/*/summary regression_result_daily
|
||||
opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
python run.py --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2/*/summary regression_result_daily
|
||||
opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
python run.py --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3/*/summary regression_result_daily
|
||||
opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
python run.py --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4/*/summary regression_result_daily
|
||||
opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse
|
||||
rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily
|
||||
python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
|
||||
- name: Remove Conda Env
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf regression_result_daily
|
||||
. /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
|
||||
conda env remove -y --name ${{env.CONDA_ENV}}
|
||||
conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }}
|
||||
conda info --envs
|
||||
|
||||
notify_to_feishu:
|
||||
|
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@ -17,7 +17,7 @@ jobs:
|
||||
python-version: '3.10'
|
||||
- name: Install pre-commit hook
|
||||
run: |
|
||||
pip install pre-commit mmengine
|
||||
pip install pre-commit==3.8.0 mmengine
|
||||
pre-commit install
|
||||
- name: Linting
|
||||
run: pre-commit run --all-files
|
||||
|
2
.github/workflows/pr-run-test.yml
vendored
2
.github/workflows/pr-run-test.yml
vendored
@ -51,7 +51,7 @@ jobs:
|
||||
conda activate ${{env.CONDA_ENV}}
|
||||
conda info --envs
|
||||
rm -rf regression_result
|
||||
python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug
|
||||
opencompass --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug
|
||||
- name: Get result
|
||||
run: |
|
||||
score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
|
||||
|
@ -594,7 +594,7 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
|
||||
## 🔜 Roadmap
|
||||
|
||||
- [x] Subjective Evaluation
|
||||
- [x] Release CompassAreana
|
||||
- [x] Release CompassAreana.
|
||||
- [x] Subjective evaluation.
|
||||
- [x] Long-context
|
||||
- [x] Long-context evaluation with extensive datasets.
|
||||
@ -603,10 +603,10 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
|
||||
- [ ] Coding evaluation leaderboard.
|
||||
- [x] Non-python language evaluation service.
|
||||
- [x] Agent
|
||||
- [ ] Support various agenet framework.
|
||||
- [ ] Support various agent frameworks.
|
||||
- [x] Evaluation of tool use of the LLMs.
|
||||
- [x] Robustness
|
||||
- [x] Support various attack method
|
||||
- [x] Support various attack methods.
|
||||
|
||||
## 👷♂️ Contributing
|
||||
|
||||
|
38
configs/api_examples/eval_api_bailing.py
Normal file
38
configs/api_examples/eval_api_bailing.py
Normal file
@ -0,0 +1,38 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
from opencompass.models import BailingAPI
|
||||
from opencompass.partitioners import NaivePartitioner
|
||||
from opencompass.runners.local_api import LocalAPIRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
datasets = [
|
||||
*ceval_datasets,
|
||||
]
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Lite-0830',
|
||||
token='xxxxxx', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
generation_kwargs={},
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
),
|
||||
]
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=NaivePartitioner),
|
||||
runner=dict(
|
||||
type=LocalAPIRunner,
|
||||
max_num_workers=2,
|
||||
concurrent_users=2,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
|
||||
work_dir = 'outputs/api_bailing/'
|
34
configs/datasets/dingo/dingo_gen.py
Normal file
34
configs/datasets/dingo/dingo_gen.py
Normal file
@ -0,0 +1,34 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import DingoDataset, DingoEvaluator
|
||||
|
||||
|
||||
dingo_paths = [
|
||||
'./data/dingo/en_192.csv',
|
||||
'./data/dingo/zh_170.csv',
|
||||
]
|
||||
|
||||
dingo_datasets = []
|
||||
for path in dingo_paths:
|
||||
dingo_reader_cfg = dict(input_columns='input', output_column=None)
|
||||
dingo_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[dict(role='HUMAN', prompt='{input}')])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT')
|
||||
|
||||
dingo_datasets.append(
|
||||
dict(
|
||||
abbr='dingo_' + path.split('/')[-1].split('.csv')[0],
|
||||
type=DingoDataset,
|
||||
path=path,
|
||||
reader_cfg=dingo_reader_cfg,
|
||||
infer_cfg=dingo_infer_cfg,
|
||||
eval_cfg=dingo_eval_cfg,
|
||||
))
|
||||
|
||||
datasets = dingo_datasets
|
@ -15,7 +15,7 @@ subjective_all_sets = [
|
||||
]
|
||||
data_path ='data/subjective/followbench/converted_data'
|
||||
|
||||
followbench_llmeval_dataset = []
|
||||
followbench_llmeval_datasets = []
|
||||
|
||||
for _name in subjective_all_sets:
|
||||
subjective_infer_cfg = dict(
|
||||
@ -48,7 +48,7 @@ for _name in subjective_all_sets:
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
followbench_llmeval_dataset.append(
|
||||
followbench_llmeval_datasets.append(
|
||||
dict(
|
||||
abbr=f'{_name}',
|
||||
type=FollowBenchDataset,
|
||||
|
73
configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py
Normal file
73
configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py
Normal file
@ -0,0 +1,73 @@
|
||||
import copy
|
||||
|
||||
from opencompass.datasets import WikiBenchDataset
|
||||
from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator
|
||||
from opencompass.openicl.icl_inferencer import PPLInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
|
||||
single_choice_prompts = {
|
||||
'single_choice_cn': [
|
||||
dict(role='HUMAN',
|
||||
prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501
|
||||
),
|
||||
dict(role='BOT', prompt='回答: B'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次'
|
||||
),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501
|
||||
),
|
||||
dict(role='BOT', prompt='回答: B'),
|
||||
dict(role='HUMAN',
|
||||
prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 丹阳县'),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(role='HUMAN', prompt='问题: {question}'),
|
||||
dict(role='BOT', prompt='回答: {answer}'),
|
||||
]
|
||||
}
|
||||
|
||||
wikibench_sets = {
|
||||
'wiki': ['single_choice_cn'],
|
||||
}
|
||||
|
||||
do_circular = True
|
||||
|
||||
wikibench_datasets = []
|
||||
|
||||
for _split in list(wikibench_sets.keys()):
|
||||
for _name in wikibench_sets[_split]:
|
||||
template = {}
|
||||
for answer in ['A', 'B', 'C', 'D']:
|
||||
one_template_round = copy.deepcopy(single_choice_prompts[_name])
|
||||
one_template_round[-1]['prompt'] = one_template_round[-1][
|
||||
'prompt'].format(answer=answer)
|
||||
template[answer] = dict(round=one_template_round)
|
||||
wikibench_infer_cfg = dict(
|
||||
prompt_template=dict(type=PromptTemplate, template=template),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=PPLInferencer),
|
||||
)
|
||||
wikibench_eval_cfg = dict(evaluator=dict(
|
||||
type=CircularEvaluator if do_circular else AccEvaluator), )
|
||||
wikibench_datasets.append(
|
||||
dict(
|
||||
type=WikiBenchDataset,
|
||||
path=f'./data/WikiBench/{_name}.jsonl',
|
||||
name='circular_' + _name if do_circular else _name,
|
||||
abbr='wikibench-' + _split + '-' + _name +
|
||||
'circular' if do_circular else '',
|
||||
reader_cfg=dict(
|
||||
input_columns=['question'],
|
||||
output_column='answer',
|
||||
),
|
||||
infer_cfg=wikibench_infer_cfg,
|
||||
eval_cfg=wikibench_eval_cfg,
|
||||
))
|
188
configs/eval_corebench_2409_base_objective.py
Normal file
188
configs/eval_corebench_2409_base_objective.py
Normal file
@ -0,0 +1,188 @@
|
||||
from mmengine.config import read_base
|
||||
import os.path as osp
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 0 Essential Configs #
|
||||
#######################################################################
|
||||
with read_base():
|
||||
# Datasets Part
|
||||
## Core Set
|
||||
# ## Examination
|
||||
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
|
||||
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
|
||||
mmlu_pro_datasets
|
||||
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
|
||||
cmmlu_datasets
|
||||
# ## Reasoning
|
||||
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets
|
||||
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets
|
||||
from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets
|
||||
|
||||
# ## Math
|
||||
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import math_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets
|
||||
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
|
||||
mathbench_datasets
|
||||
|
||||
# ## Scientific
|
||||
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \
|
||||
gpqa_datasets
|
||||
|
||||
# ## Coding
|
||||
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets
|
||||
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import sanitized_mbpp_datasets
|
||||
# TODO: Add LiveCodeBench
|
||||
|
||||
# ## Instruction Following
|
||||
# from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
|
||||
|
||||
# Summarizer
|
||||
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
|
||||
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
|
||||
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
|
||||
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
|
||||
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
|
||||
mathbench_2024_summary_groups
|
||||
|
||||
# Model List
|
||||
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import models as lmdeploy_qwen2_5_1_5b_model
|
||||
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
|
||||
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
|
||||
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
|
||||
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
|
||||
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
|
||||
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
|
||||
|
||||
#######################################################################
|
||||
# PART 1 Datasets List #
|
||||
#######################################################################
|
||||
# datasets list for evaluation
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 2 Datset Summarizer #
|
||||
#######################################################################
|
||||
# with read_base():
|
||||
|
||||
core_summary_groups = [
|
||||
{
|
||||
'name': 'core_average',
|
||||
'subsets': [
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['cmmlu', 'accuracy'],
|
||||
['bbh', 'naive_average'],
|
||||
['hellaswag', 'accuracy'],
|
||||
['drop', 'accuracy'],
|
||||
['math', 'accuracy'],
|
||||
['gsm8k', 'accuracy'],
|
||||
['mathbench-t (average)', 'naive_average'],
|
||||
['GPQA_diamond', 'accuracy'],
|
||||
['openai_humaneval', 'humaneval_pass@1'],
|
||||
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||
['sanitized_mbpp', 'score'],
|
||||
['mathbench-t (average)', 'naive_average']
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['cmmlu', 'accuracy'],
|
||||
['bbh', 'naive_average'],
|
||||
['hellaswag', 'accuracy'],
|
||||
['drop', 'accuracy'],
|
||||
['math', 'accuracy'],
|
||||
['gsm8k', 'accuracy'],
|
||||
['mathbench-t (average)', 'naive_average'],
|
||||
['GPQA_diamond', 'accuracy'],
|
||||
['openai_humaneval', 'humaneval_pass@1'],
|
||||
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||
['sanitized_mbpp', 'score'],
|
||||
'mathbench-a (average)',
|
||||
'mathbench-t (average)'
|
||||
'',
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu-stem', 'accuracy'],
|
||||
['mmlu-social-science', 'accuracy'],
|
||||
['mmlu-humanities', 'accuracy'],
|
||||
['mmlu-other', 'accuracy'],
|
||||
|
||||
'',
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['mmlu_pro_math','accuracy'],
|
||||
['mmlu_pro_physics', 'accuracy'],
|
||||
['mmlu_pro_chemistry', 'accuracy'],
|
||||
['mmlu_pro_law', 'accuracy'],
|
||||
['mmlu_pro_engineering', 'accuracy'],
|
||||
['mmlu_pro_other', 'accuracy'],
|
||||
['mmlu_pro_economics', 'accuracy'],
|
||||
['mmlu_pro_health', 'accuracy'],
|
||||
['mmlu_pro_psychology', 'accuracy'],
|
||||
['mmlu_pro_business', 'accuracy'],
|
||||
['mmlu_pro_biology', 'accuracy'],
|
||||
['mmlu_pro_philosophy', 'accuracy'],
|
||||
['mmlu_pro_computer_science','accuracy'],
|
||||
['mmlu_pro_history', 'accuracy'],
|
||||
'',
|
||||
['cmmlu', 'accuracy'],
|
||||
['cmmlu-stem', 'accuracy'],
|
||||
['cmmlu-social-science', 'accuracy'],
|
||||
['cmmlu-humanities', 'accuracy'],
|
||||
['cmmlu-other', 'accuracy'],
|
||||
['cmmlu-china-specific', 'accuracy'],
|
||||
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||
)
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 3 Models List #
|
||||
#######################################################################
|
||||
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 4 Inference/Evaluation Configuaration #
|
||||
#######################################################################
|
||||
|
||||
# Local Runner
|
||||
infer = dict(
|
||||
partitioner=dict(
|
||||
type=NumWorkerPartitioner,
|
||||
num_worker=8
|
||||
),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
retry=0, # Modify if needed
|
||||
task=dict(type=OpenICLInferTask)
|
||||
),
|
||||
)
|
||||
|
||||
# eval with local runner
|
||||
eval = dict(
|
||||
partitioner=dict(type=NaivePartitioner, n=10),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
task=dict(type=OpenICLEvalTask)),
|
||||
)
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 5 Utils Configuaration #
|
||||
#######################################################################
|
||||
base_exp_dir = 'outputs/corebench_2409_objective/'
|
||||
work_dir = osp.join(base_exp_dir, 'base_objective')
|
220
configs/eval_corebench_2409_chat_objective.py
Normal file
220
configs/eval_corebench_2409_chat_objective.py
Normal file
@ -0,0 +1,220 @@
|
||||
from mmengine.config import read_base
|
||||
import os.path as osp
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 0 Essential Configs #
|
||||
#######################################################################
|
||||
with read_base():
|
||||
# Datasets Part
|
||||
## Core Set
|
||||
# ## Examination
|
||||
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets
|
||||
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
|
||||
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets
|
||||
|
||||
# ## Reasoning
|
||||
from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
|
||||
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
|
||||
hellaswag_datasets
|
||||
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import drop_datasets
|
||||
|
||||
# ## Math
|
||||
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
|
||||
gsm8k_datasets
|
||||
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import mathbench_datasets
|
||||
|
||||
# ## Scientific
|
||||
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
|
||||
|
||||
# ## Coding
|
||||
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import sanitized_mbpp_datasets
|
||||
# TODO: Add LiveCodeBench
|
||||
|
||||
# ## Instruction Following
|
||||
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
|
||||
|
||||
# Summarizer
|
||||
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
|
||||
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
|
||||
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
|
||||
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
|
||||
|
||||
|
||||
# Model List
|
||||
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
|
||||
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
|
||||
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
|
||||
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
|
||||
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
|
||||
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
|
||||
|
||||
#######################################################################
|
||||
# PART 1 Datasets List #
|
||||
#######################################################################
|
||||
# datasets list for evaluation
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 2 Datset Summarizer #
|
||||
#######################################################################
|
||||
# with read_base():
|
||||
|
||||
core_summary_groups = [
|
||||
{
|
||||
'name': 'core_average',
|
||||
'subsets': [
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['cmmlu', 'accuracy'],
|
||||
['bbh', 'score'],
|
||||
['math', 'accuracy'],
|
||||
['openai_humaneval', 'humaneval_pass@1'],
|
||||
['GPQA_diamond', 'accuracy'],
|
||||
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||
['drop', 'accuracy'],
|
||||
['sanitized_mbpp', 'score'],
|
||||
['gsm8k', 'accuracy'],
|
||||
['hellaswag', 'accuracy'],
|
||||
['mathbench-t (average)', 'naive_average']
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
['core_average', 'naive_average'],
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['cmmlu', 'accuracy'],
|
||||
['bbh', 'score'],
|
||||
['math', 'accuracy'],
|
||||
['openai_humaneval', 'humaneval_pass@1'],
|
||||
['GPQA_diamond', 'accuracy'],
|
||||
['IFEval', 'Prompt-level-strict-accuracy'],
|
||||
['drop', 'accuracy'],
|
||||
['sanitized_mbpp', 'score'],
|
||||
['gsm8k', 'accuracy'],
|
||||
['hellaswag', 'accuracy'],
|
||||
'mathbench-a (average)',
|
||||
'mathbench-t (average)'
|
||||
'',
|
||||
|
||||
['mmlu', 'accuracy'],
|
||||
['mmlu-stem', 'accuracy'],
|
||||
['mmlu-social-science', 'accuracy'],
|
||||
['mmlu-humanities', 'accuracy'],
|
||||
['mmlu-other', 'accuracy'],
|
||||
|
||||
'',
|
||||
['mmlu_pro', 'accuracy'],
|
||||
['mmlu_pro_math','accuracy'],
|
||||
['mmlu_pro_physics', 'accuracy'],
|
||||
['mmlu_pro_chemistry', 'accuracy'],
|
||||
['mmlu_pro_law', 'accuracy'],
|
||||
['mmlu_pro_engineering', 'accuracy'],
|
||||
['mmlu_pro_other', 'accuracy'],
|
||||
['mmlu_pro_economics', 'accuracy'],
|
||||
['mmlu_pro_health', 'accuracy'],
|
||||
['mmlu_pro_psychology', 'accuracy'],
|
||||
['mmlu_pro_business', 'accuracy'],
|
||||
['mmlu_pro_biology', 'accuracy'],
|
||||
['mmlu_pro_philosophy', 'accuracy'],
|
||||
['mmlu_pro_computer_science','accuracy'],
|
||||
['mmlu_pro_history', 'accuracy'],
|
||||
'',
|
||||
['cmmlu', 'accuracy'],
|
||||
['cmmlu-stem', 'accuracy'],
|
||||
['cmmlu-social-science', 'accuracy'],
|
||||
['cmmlu-humanities', 'accuracy'],
|
||||
['cmmlu-other', 'accuracy'],
|
||||
['cmmlu-china-specific', 'accuracy'],
|
||||
'',
|
||||
['bbh', 'extract_rate'],
|
||||
['math', 'extract_rate'],
|
||||
# ['openai_humaneval', 'extract_rate'],
|
||||
['GPQA_diamond', 'extract_rate'],
|
||||
# ['IFEval', 'extract_rate'],
|
||||
'',
|
||||
['mmlu', 'extract_rate'],
|
||||
['mmlu-stem', 'extract_rate'],
|
||||
['mmlu-social-science', 'extract_rate'],
|
||||
['mmlu-humanities', 'extract_rate'],
|
||||
['mmlu-other', 'extract_rate'],
|
||||
'',
|
||||
['mmlu_pro', 'extract_rate'],
|
||||
['mmlu_pro_math', 'extract_rate'],
|
||||
['mmlu_pro_physics', 'extract_rate'],
|
||||
['mmlu_pro_chemistry', 'extract_rate'],
|
||||
['mmlu_pro_law', 'extract_rate'],
|
||||
['mmlu_pro_engineering', 'extract_rate'],
|
||||
['mmlu_pro_other', 'extract_rate'],
|
||||
['mmlu_pro_economics', 'extract_rate'],
|
||||
['mmlu_pro_health', 'extract_rate'],
|
||||
['mmlu_pro_psychology', 'extract_rate'],
|
||||
['mmlu_pro_business', 'extract_rate'],
|
||||
['mmlu_pro_biology', 'extract_rate'],
|
||||
['mmlu_pro_philosophy', 'extract_rate'],
|
||||
['mmlu_pro_computer_science', 'extract_rate'],
|
||||
['mmlu_pro_history', 'extract_rate'],
|
||||
'',
|
||||
['cmmlu', 'extract_rate'],
|
||||
['cmmlu-stem', 'extract_rate'],
|
||||
['cmmlu-social-science', 'extract_rate'],
|
||||
['cmmlu-humanities', 'extract_rate'],
|
||||
['cmmlu-other', 'extract_rate'],
|
||||
['cmmlu-china-specific', 'extract_rate'],
|
||||
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||
)
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 3 Models List #
|
||||
#######################################################################
|
||||
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 4 Inference/Evaluation Configuaration #
|
||||
#######################################################################
|
||||
|
||||
# Local Runner
|
||||
infer = dict(
|
||||
partitioner=dict(
|
||||
type=NumWorkerPartitioner,
|
||||
num_worker=8
|
||||
),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
retry=0, # Modify if needed
|
||||
task=dict(type=OpenICLInferTask)
|
||||
),
|
||||
)
|
||||
|
||||
# eval with local runner
|
||||
eval = dict(
|
||||
partitioner=dict(type=NaivePartitioner, n=10),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
task=dict(type=OpenICLEvalTask)),
|
||||
)
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 5 Utils Configuaration #
|
||||
#######################################################################
|
||||
base_exp_dir = 'outputs/corebench_2409_objective/'
|
||||
work_dir = osp.join(base_exp_dir, 'chat_objective')
|
138
configs/eval_corebench_2409_longcontext.py
Normal file
138
configs/eval_corebench_2409_longcontext.py
Normal file
@ -0,0 +1,138 @@
|
||||
import os.path as osp
|
||||
from copy import deepcopy
|
||||
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import (HuggingFacewithChatTemplate,
|
||||
TurboMindModelwithChatTemplate)
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.runners import DLCRunner, LocalRunner
|
||||
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 0 Essential Configs #
|
||||
#######################################################################
|
||||
with read_base():
|
||||
from opencompass.configs.datasets.longbench.longbench import \
|
||||
longbench_datasets
|
||||
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \
|
||||
needlebench_datasets as needlebench_8k_datasets
|
||||
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \
|
||||
needlebench_datasets as needlebench_32k_datasets
|
||||
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
|
||||
needlebench_datasets as needlebench_128k_datasets
|
||||
from opencompass.configs.datasets.ruler.ruler_8k_gen import \
|
||||
ruler_datasets as ruler_8k_datasets
|
||||
from opencompass.configs.datasets.ruler.ruler_32k_gen import \
|
||||
ruler_datasets as ruler_32k_datasets
|
||||
from opencompass.configs.datasets.ruler.ruler_128k_gen import \
|
||||
ruler_datasets as ruler_128k_datasets
|
||||
# Summary Groups
|
||||
from opencompass.configs.summarizers.groups.longbench import \
|
||||
longbench_summary_groups
|
||||
from opencompass.configs.summarizers.groups.ruler import \
|
||||
ruler_summary_groups
|
||||
from opencompass.configs.summarizers.needlebench import (
|
||||
needlebench_8k_summarizer, needlebench_32k_summarizer,
|
||||
needlebench_128k_summarizer)
|
||||
|
||||
# Instruct models
|
||||
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
|
||||
models as lmdeploy_qwen2_7b_instruct_model
|
||||
|
||||
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
|
||||
models as lmdeploy_internlm2_5_7b_1m_chat_model
|
||||
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
|
||||
models as llama3_1_8b_instruct_model
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 1 Datasets List #
|
||||
#######################################################################
|
||||
# datasets list for evaluation
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 2 Datset Summarizer #
|
||||
#######################################################################
|
||||
needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups']
|
||||
needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups']
|
||||
needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups']
|
||||
|
||||
# Instruct models summarizer
|
||||
summarizer = dict(
|
||||
dataset_abbrs=[
|
||||
['ruler_8k', 'naive_average'],
|
||||
['ruler_32k', 'naive_average'],
|
||||
['ruler_128k', 'naive_average'],
|
||||
['NeedleBench-Overall-Score-8K', 'weighted_average'],
|
||||
['NeedleBench-Overall-Score-32K', 'weighted_average'],
|
||||
['NeedleBench-Overall-Score-128K', 'weighted_average'],
|
||||
['longbench', 'naive_average'],
|
||||
['longbench_zh', 'naive_average'],
|
||||
['longbench_en', 'naive_average'],
|
||||
'',
|
||||
'longbench_single-document-qa',
|
||||
'longbench_multi-document-qa',
|
||||
'longbench_summarization',
|
||||
'longbench_few-shot-learning',
|
||||
'longbench_synthetic-tasks',
|
||||
'longbench_code-completion',
|
||||
],
|
||||
summary_groups=sum(
|
||||
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
||||
)


#######################################################################
#                        PART 3  Models List                          #
#######################################################################

lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4

llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4
llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4

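# The overrides above stretch both instruct models to a ~1M-token window:
# max_seq_len / session_len are raised to 1048576, rope_scaling_factor=4
# extends the RoPE positional range, and tp=4 with num_gpus=4 shards each
# model across four GPUs.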
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

#######################################################################
#               PART 4  Inference/Evaluation Configuration            #
#######################################################################

# Local Runner
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask)
    ),
)

# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLEvalTask)),
)


#######################################################################
#                      PART 5  Utils Configuration                    #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'long_context')
|
configs/eval_corebench_2409_subjective.py (new file, 134 lines)
@@ -0,0 +1,134 @@
|
||||
import os.path as osp
|
||||
from copy import deepcopy
|
||||
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import (HuggingFacewithChatTemplate,
|
||||
TurboMindModelwithChatTemplate)
|
||||
from opencompass.models.openai_api import OpenAI, OpenAISDK
|
||||
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.runners import DLCRunner, LocalRunner
|
||||
from opencompass.summarizers import SubjectiveSummarizer
|
||||
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 0 Essential Configs #
|
||||
#######################################################################
|
||||
with read_base():
|
||||
# Datasets Part
|
||||
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
|
||||
arenahard_datasets
|
||||
from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
|
||||
alignbench_datasets
|
||||
from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \
|
||||
mtbench_datasets
|
||||
|
||||
# Summarizer
|
||||
|
||||
# Model List
|
||||
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
|
||||
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 1 Datasets List #
|
||||
#######################################################################
|
||||
# datasets list for evaluation
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 2 Dataset Summarizer #
|
||||
#######################################################################
|
||||
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
|
||||
|
||||
#######################################################################
|
||||
# PART 3 Models List #
|
||||
#######################################################################
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='internlm2_5-7b-chat-turbomind',
|
||||
path='internlm/internlm2_5-7b-chat',
|
||||
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=40, temperature=1.0, top_p=0.9, max_new_tokens=4096),
|
||||
max_seq_len=16384,
|
||||
max_out_len=4096,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
|
||||
models = sum([v for k, v in locals().items() if k.endswith('_model')], models)
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 4 Inference/Evaluation Configuration #
|
||||
#######################################################################
|
||||
|
||||
# Local Runner
|
||||
infer = dict(
|
||||
partitioner=dict(
|
||||
type=NumWorkerPartitioner,
|
||||
num_worker=8
|
||||
),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
retry=0, # Modify if needed
|
||||
task=dict(type=OpenICLInferTask)
|
||||
),
|
||||
)
|
||||
|
||||
# JudgeLLM
|
||||
api_meta_template = dict(round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
])
|
||||
|
||||
|
||||
judge_models = [
|
||||
dict(
|
||||
type=OpenAISDK,
|
||||
abbr='gpt-4o-2024-08-06',
|
||||
path='gpt-4o-2024-08-06',
|
||||
# openai_api_base=
|
||||
# 'http://10.140.1.86:10001/v1', # Change to your own url if needed.
|
||||
key='YOUR_API_KEY',
|
||||
retry=10,
|
||||
meta_template=api_meta_template,
|
||||
rpm_verbose=True,
|
||||
query_per_second=1,
|
||||
max_out_len=4096,
|
||||
max_seq_len=16384,
|
||||
batch_size=16,
|
||||
temperature=0.01,
|
||||
tokenizer_path='gpt-4o-2024-08-06'
|
||||
)
|
||||
]
|
||||
|
||||
# Evaluation with local runner
|
||||
eval = dict(
|
||||
partitioner=dict(
|
||||
type=SubjectiveNaivePartitioner,
|
||||
models=models,
|
||||
judge_models=judge_models,
|
||||
),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
task=dict(type=SubjectiveEvalTask)),
|
||||
)
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
# PART 5 Utils Configuration #
|
||||
#######################################################################
|
||||
base_exp_dir = 'outputs/corebench/'
|
||||
work_dir = osp.join(base_exp_dir, 'chat_subjective')
|
configs/eval_dingo.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from mmengine.config import read_base

with read_base():
    from .models.hf_internlm.hf_internlm_7b import models
    from .datasets.dingo.dingo_gen import datasets

work_dir = './outputs/eval_dingo'
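# Launch example (assuming the standard OpenCompass entry point at the repo root):
#   python run.py configs/eval_dingo.py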
@ -1,69 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import LmdeployPytorchModel
|
||||
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
|
||||
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
|
||||
    # and output the results in a chosen format
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
|
||||
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
||||
],
|
||||
eos_token_id=103028)
|
||||
|
||||
# config for internlm-chat-7b
|
||||
internlm_chat_7b = dict(
|
||||
type=LmdeployPytorchModel,
|
||||
abbr='internlm-chat-7b-pytorch',
|
||||
path='internlm/internlm-chat-7b',
|
||||
engine_config=dict(session_len=2048,
|
||||
max_batch_size=16),
|
||||
gen_config=dict(top_k=1,
|
||||
top_p=0.8,
|
||||
temperature=1.0,
|
||||
max_new_tokens=100),
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=16,
|
||||
concurrency=16,
|
||||
meta_template=meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>',
|
||||
)
|
||||
|
||||
# config for internlm-chat-20b
|
||||
internlm_chat_20b = dict(
|
||||
type=LmdeployPytorchModel,
|
||||
abbr='internlm-chat-20b-pytorch',
|
||||
path='internlm/internlm-chat-20b',
|
||||
engine_config=dict(session_len=2048,
|
||||
max_batch_size=8),
|
||||
gen_config=dict(top_k=1,
|
||||
top_p=0.8,
|
||||
temperature=1.0,
|
||||
max_new_tokens=100),
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
concurrency=8,
|
||||
meta_template=meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>',
|
||||
)
|
||||
|
||||
models = [internlm_chat_20b]
|
@ -1,41 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.lmdeploy_tis import LmdeployTisModel
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
|
||||
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
|
||||
    # and output the results in a chosen format
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||
],
|
||||
eos_token_id=92542
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=LmdeployTisModel,
|
||||
abbr='internlm-chat-20b-lmdeploy-tis',
|
||||
path='internlm/internlm-chat-20b',
|
||||
tis_addr='0.0.0.0:33337',
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
meta_template=meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<|im_end|>',
|
||||
)
|
||||
]
|
@ -1,40 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.turbomind_tis import TurboMindTisModel
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
|
||||
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
|
||||
    # and output the results in a chosen format
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
|
||||
meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='<|User|>:', end='\n'),
|
||||
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
||||
],
|
||||
eos_token_id=103028)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindTisModel,
|
||||
abbr='internlm-chat-20b-turbomind',
|
||||
path='internlm',
|
||||
tis_addr='0.0.0.0:33337',
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
meta_template=meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
)
|
||||
]
|
@ -1,28 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.turbomind_tis import TurboMindTisModel
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
    # and output the results in a chosen format
|
||||
from opencompass.configs.summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindTisModel,
|
||||
abbr='internlm-chat-20b-turbomind',
|
||||
path='internlm',
|
||||
tis_addr='0.0.0.0:33337',
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
)
|
||||
]
|
configs/models/bailing_api/bailing-lite-0830.py (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
from opencompass.models import BailingAPI
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=False),
|
||||
],
|
||||
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Lite-0830',
|
||||
token='', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
generation_kwargs={
|
||||
'temperature': 0.4,
|
||||
'top_p': 1.0,
|
||||
'top_k': -1,
|
||||
'n': 1,
|
||||
'logprobs': 1,
|
||||
'use_beam_search': False,
|
||||
},
|
||||
),
|
||||
]
|
configs/models/bailing_api/bailing-pro-0920.py (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
from opencompass.models import BailingAPI
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=False),
|
||||
],
|
||||
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Pro-0920',
|
||||
token='', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
generation_kwargs={
|
||||
'temperature': 0.4,
|
||||
'top_p': 1.0,
|
||||
'top_k': -1,
|
||||
'n': 1,
|
||||
'logprobs': 1,
|
||||
'use_beam_search': False,
|
||||
},
|
||||
),
|
||||
]
|
@ -1,15 +1,24 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='internlm2-chat-7b-turbomind',
|
||||
abbr=f'internlm2-chat-7b-lmdeploy',
|
||||
path='internlm/internlm2-chat-7b',
|
||||
engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
|
||||
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
|
||||
# If the model is not supported by 'turbomind', it will fallback to
|
||||
# 'pytorch'
|
||||
backend='turbomind',
|
||||
# For the detailed engine config and generation config, please refer to
|
||||
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
|
||||
engine_config=dict(tp=1),
|
||||
gen_config=dict(do_sample=False),
|
||||
max_seq_len=8192,
|
||||
max_out_len=4096,
|
||||
batch_size=16,
|
||||
# the max number of prompts that LMDeploy receives
|
||||
# in `generate` function
|
||||
batch_size=5000,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
|
configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
from opencompass.models import TurboMindModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModel,
|
||||
abbr='qwen2.5-1.5b-turbomind',
|
||||
path='Qwen/Qwen2.5-1.5B',
|
||||
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
|
||||
max_seq_len=7168,
|
||||
max_out_len=1024,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
from opencompass.models import TurboMindModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModel,
|
||||
abbr='qwen2.5-7b-turbomind',
|
||||
path='Qwen/Qwen2.5-7B',
|
||||
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
|
||||
max_seq_len=7168,
|
||||
max_out_len=1024,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
docs/en/advanced_guides/evaluation_lmdeploy.md (new file, 88 lines)
@@ -0,0 +1,88 @@
# Evaluation with LMDeploy

We support evaluating models accelerated by [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit for compressing, deploying, and serving LLMs, and it delivers remarkable inference performance. This guide illustrates how to evaluate a model with LMDeploy support in OpenCompass.

## Setup

### Install OpenCompass

Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install OpenCompass and prepare the evaluation datasets.

### Install LMDeploy

Install LMDeploy via pip (Python 3.8+):

```shell
pip install lmdeploy
```

The default prebuilt package is compiled against CUDA 12. If you need a CUDA 11+ build, install lmdeploy with:

```shell
export LMDEPLOY_VERSION=0.6.0
export PYTHON_VERSION=310
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```
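If the installation succeeded, a quick sanity check (not part of the original guide) is to print the installed version:

```shell
python -c "import lmdeploy; print(lmdeploy.__version__)"
```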

## Evaluation

When evaluating a model, you need to prepare an evaluation configuration that specifies the evaluation datasets, the model, and the inference parameters.

Taking [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as an example, the evaluation config is as follows:

```python
# configure the dataset
from mmengine.config import read_base


with read_base():
    # choose a list of datasets
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
        gsm8k_datasets
    # and output the results in a chosen format
    from .summarizers.medium import summarizer

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# configure lmdeploy
from opencompass.models import TurboMindModelwithChatTemplate


# configure the model
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='internlm2-chat-7b-lmdeploy',
        # model path, which can be the address of a model repository on the
        # Hugging Face Hub or a local path
        path='internlm/internlm2-chat-7b',
        # inference backend of LMDeploy. It can be either 'turbomind' or
        # 'pytorch'. If the model is not supported by 'turbomind', it will
        # fall back to 'pytorch'
        backend='turbomind',
        # For the detailed engine config and generation config, please refer to
        # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
        engine_config=dict(tp=1),
        gen_config=dict(do_sample=False),
        # the max size of the context window
        max_seq_len=7168,
        # the max number of new tokens
        max_out_len=1024,
        # the max number of prompts that LMDeploy receives
        # in `generate` function
        batch_size=5000,
        run_cfg=dict(num_gpus=1),
    )
]
```
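If TurboMind does not support a given model, the same config can point at LMDeploy's PyTorch engine instead. Below is a minimal sketch of that variant; only the fields that differ are changed, the `abbr` label is arbitrary, and the exact engine options depend on your LMDeploy version:

```python
# a hypothetical variant of the model entry above, using the PyTorch backend
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='internlm2-chat-7b-lmdeploy-pytorch',
        path='internlm/internlm2-chat-7b',
        backend='pytorch',         # use LMDeploy's PyTorch engine
        engine_config=dict(tp=1),  # engine options are backend-specific
        gen_config=dict(do_sample=False),
        max_seq_len=7168,
        max_out_len=1024,
        batch_size=5000,
        run_cfg=dict(num_gpus=1),
    )
]
```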

Place the above configuration in a file, e.g. "configs/eval_internlm2_lmdeploy.py". Then, from the OpenCompass root directory, start the evaluation with:

```shell
python run.py configs/eval_internlm2_lmdeploy.py -w outputs
```

You will get the evaluation results once inference and evaluation have finished.
@ -1,78 +0,0 @@
|
||||
# Evaluation with LMDeploy
|
||||
|
||||
We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. **TurboMind** is an efficient inference engine proposed by LMDeploy. OpenCompass is compatible with TurboMind. We now illustrate how to evaluate a model with the support of TurboMind in OpenCompass.
|
||||
|
||||
## Setup
|
||||
|
||||
### Install OpenCompass
|
||||
|
||||
Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets.
|
||||
|
||||
### Install LMDeploy
|
||||
|
||||
Install lmdeploy via pip (python 3.8+)
|
||||
|
||||
```shell
|
||||
pip install lmdeploy
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
|
||||
OpenCompass integrates turbomind's python API for evaluation.
|
||||
|
||||
We take the InternLM-20B as example. Firstly, we prepare the evaluation config `configs/eval_internlm_turbomind.py`:
|
||||
|
||||
```python
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.turbomind import TurboMindModel
|
||||
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
# and output the results in a chosen format
|
||||
from .summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
# config for internlm-20b model
|
||||
internlm_20b = dict(
|
||||
type=TurboMindModel,
|
||||
abbr='internlm-20b-turbomind',
|
||||
path="internlm/internlm-20b", # this path should be same as in huggingface
|
||||
engine_config=dict(session_len=2048,
|
||||
max_batch_size=8,
|
||||
rope_scaling_factor=1.0),
|
||||
gen_config=dict(top_k=1, top_p=0.8,
|
||||
temperature=1.0,
|
||||
max_new_tokens=100),
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
concurrency=8,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>'
|
||||
)
|
||||
|
||||
models = [internlm_20b]
|
||||
```
|
||||
|
||||
Then, in the home folder of OpenCompass, start evaluation by the following command:
|
||||
|
||||
```shell
|
||||
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
|
||||
```
|
||||
|
||||
You are expected to get the evaluation results after the inference and evaluation.
|
||||
|
||||
**Note**:
|
||||
|
||||
- If you want to pass more arguments for `engine_config` and `gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig)
|
||||
and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig)
|
||||
- If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py`
|
||||
- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line.
|
docs/zh_cn/advanced_guides/evaluation_lmdeploy.md (new file, 86 lines)
@@ -0,0 +1,86 @@
|
||||
# 使用 LMDeploy 加速评测
|
||||
|
||||
我们支持在评测大语言模型时,使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 作为推理加速引擎。LMDeploy 是涵盖了 LLM 和 VLM 任务的全套轻量化、部署和服务解决方案,拥有卓越的推理性能。本教程将介绍如何使用 LMDeploy 加速对模型的评测。
|
||||
|
||||
## 环境配置
|
||||
|
||||
### 安装 OpenCompass
|
||||
|
||||
请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。
|
||||
|
||||
### 安装 LMDeploy
|
||||
|
||||
使用 pip 安装 LMDeploy (python 3.8+):
|
||||
|
||||
```shell
|
||||
pip install lmdeploy
|
||||
```
|
||||
|
||||
LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令:
|
||||
|
||||
```shell
|
||||
export LMDEPLOY_VERSION=0.6.0
|
||||
export PYTHON_VERSION=310
|
||||
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
```
|
||||
|
||||
## 评测
|
||||
|
||||
在评测一个模型时,需要准备一份评测配置,指明评测集、模型和推理参数等信息。
|
||||
|
||||
以 [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) 模型为例,相关的配置信息如下:
|
||||
|
||||
```python
|
||||
# configure the dataset
|
||||
from mmengine.config import read_base
|
||||
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
|
||||
gsm8k_datasets
|
||||
# and output the results in a chosen format
|
||||
from .summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
# configure lmdeploy
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
|
||||
|
||||
# configure the model
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr=f'internlm2-chat-7b-lmdeploy',
|
||||
# model path, which can be the address of a model repository on the Hugging Face Hub or a local path
|
||||
path='internlm/internlm2-chat-7b',
|
||||
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
|
||||
# If the model is not supported by 'turbomind', it will fallback to
|
||||
# 'pytorch'
|
||||
backend='turbomind',
|
||||
# For the detailed engine config and generation config, please refer to
|
||||
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
|
||||
engine_config=dict(tp=1),
|
||||
gen_config=dict(do_sample=False),
|
||||
# the max size of the context window
|
||||
max_seq_len=7168,
|
||||
# the max number of new tokens
|
||||
max_out_len=1024,
|
||||
# the max number of prompts that LMDeploy receives
|
||||
# in `generate` function
|
||||
batch_size=32,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
```
|
||||
|
||||
把上述配置放在文件中,比如 "configs/eval_internlm2_lmdeploy.py"。然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果:
|
||||
|
||||
```shell
|
||||
python run.py configs/eval_internlm2_lmdeploy.py -w outputs
|
||||
```
|
@ -1,75 +0,0 @@
|
||||
# 评测 LMDeploy 模型
|
||||
|
||||
我们支持评测使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 加速过的大语言模型。LMDeploy 由 MMDeploy 和 MMRazor 团队联合开发,是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。 **TurboMind** 是 LMDeploy 推出的高效推理引擎。OpenCompass 对 TurboMind 进行了适配,本教程将介绍如何使用 OpenCompass 来对 TurboMind 加速后的模型进行评测。
|
||||
|
||||
## 环境配置
|
||||
|
||||
### 安装 OpenCompass
|
||||
|
||||
请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。
|
||||
|
||||
### 安装 LMDeploy
|
||||
|
||||
使用 pip 安装 LMDeploy (python 3.8+):
|
||||
|
||||
```shell
|
||||
pip install lmdeploy
|
||||
```
|
||||
|
||||
## 评测
|
||||
|
||||
OpenCompass 支持分别通过 turbomind python API 评测数据集。
|
||||
|
||||
下文以 InternLM-20B 模型为例,介绍如何评测。首先我们准备好测试配置文件`configs/eval_internlm_turbomind.py`:
|
||||
|
||||
```python
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.turbomind import TurboMindModel
|
||||
|
||||
|
||||
with read_base():
|
||||
# choose a list of datasets
|
||||
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
||||
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
||||
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
|
||||
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
||||
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||
# and output the results in a chosen format
|
||||
from .summarizers.medium import summarizer
|
||||
|
||||
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
||||
|
||||
# config for internlm-20b model
|
||||
internlm_20b = dict(
|
||||
type=TurboMindModel,
|
||||
abbr='internlm-20b-turbomind',
|
||||
path="internlm/internlm-20b", # 注意路径与huggingface保持一致
|
||||
engine_config=dict(session_len=2048,
|
||||
max_batch_size=8,
|
||||
rope_scaling_factor=1.0),
|
||||
gen_config=dict(top_k=1, top_p=0.8,
|
||||
temperature=1.0,
|
||||
max_new_tokens=100),
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
concurrency=8,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>'
|
||||
)
|
||||
|
||||
models = [internlm_20b]
|
||||
```
|
||||
|
||||
然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果:
|
||||
|
||||
```shell
|
||||
python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b
|
||||
```
|
||||
|
||||
**注:**
|
||||
|
||||
- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig)
|
||||
- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py`
|
||||
- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。
|
@ -1 +1 @@
|
||||
__version__ = '0.3.2.post1'
|
||||
__version__ = '0.3.3'
|
||||
|
opencompass/configs/datasets/dingo/dingo_gen.py (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import DingoDataset, DingoEvaluator
|
||||
|
||||
|
||||
dingo_paths = [
|
||||
'./data/dingo/en_192.csv',
|
||||
'./data/dingo/zh_170.csv',
|
||||
]
|
||||
|
||||
dingo_datasets = []
|
||||
for path in dingo_paths:
|
||||
dingo_reader_cfg = dict(input_columns='input', output_column=None)
|
||||
dingo_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[dict(role='HUMAN', prompt='{input}')])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT')
|
||||
|
||||
dingo_datasets.append(
|
||||
dict(
|
||||
abbr='dingo_' + path.split('/')[-1].split('.csv')[0],
|
||||
type=DingoDataset,
|
||||
path=path,
|
||||
reader_cfg=dingo_reader_cfg,
|
||||
infer_cfg=dingo_infer_cfg,
|
||||
eval_cfg=dingo_eval_cfg,
|
||||
))
|
||||
|
||||
datasets = dingo_datasets
|
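# Each CSV above becomes one dataset entry whose abbr is 'dingo_' plus the file
# stem, e.g. 'dingo_en_192' and 'dingo_zh_170'.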
@ -15,7 +15,7 @@ subjective_all_sets = [
|
||||
]
|
||||
data_path ='data/subjective/followbench/converted_data'
|
||||
|
||||
followbench_llmeval_dataset = []
|
||||
followbench_llmeval_datasets = []
|
||||
|
||||
for _name in subjective_all_sets:
|
||||
subjective_infer_cfg = dict(
|
||||
@ -48,7 +48,7 @@ for _name in subjective_all_sets:
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
followbench_llmeval_dataset.append(
|
||||
followbench_llmeval_datasets.append(
|
||||
dict(
|
||||
abbr=f'{_name}',
|
||||
type=FollowBenchDataset,
|
||||
|
@ -0,0 +1,73 @@
|
||||
import copy
|
||||
|
||||
from opencompass.datasets import WikiBenchDataset
|
||||
from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator
|
||||
from opencompass.openicl.icl_inferencer import PPLInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
|
||||
single_choice_prompts = {
|
||||
'single_choice_cn': [
|
||||
dict(role='HUMAN',
|
||||
prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501
|
||||
),
|
||||
dict(role='BOT', prompt='回答: B'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次'
|
||||
),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501
|
||||
),
|
||||
dict(role='BOT', prompt='回答: B'),
|
||||
dict(role='HUMAN',
|
||||
prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 丹阳县'),
|
||||
dict(role='BOT', prompt='回答: C'),
|
||||
dict(role='HUMAN', prompt='问题: {question}'),
|
||||
dict(role='BOT', prompt='回答: {answer}'),
|
||||
]
|
||||
}
|
||||
|
||||
wikibench_sets = {
|
||||
'wiki': ['single_choice_cn'],
|
||||
}
|
||||
|
||||
do_circular = True
|
||||
|
||||
wikibench_datasets = []
|
||||
|
||||
for _split in list(wikibench_sets.keys()):
|
||||
for _name in wikibench_sets[_split]:
|
||||
template = {}
|
||||
for answer in ['A', 'B', 'C', 'D']:
|
||||
one_template_round = copy.deepcopy(single_choice_prompts[_name])
|
||||
one_template_round[-1]['prompt'] = one_template_round[-1][
|
||||
'prompt'].format(answer=answer)
|
||||
template[answer] = dict(round=one_template_round)
|
||||
wikibench_infer_cfg = dict(
|
||||
prompt_template=dict(type=PromptTemplate, template=template),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=PPLInferencer),
|
||||
)
|
||||
wikibench_eval_cfg = dict(evaluator=dict(
|
||||
type=CircularEvaluator if do_circular else AccEvaluator), )
|
||||
wikibench_datasets.append(
|
||||
dict(
|
||||
type=WikiBenchDataset,
|
||||
path=f'./data/WikiBench/{_name}.jsonl',
|
||||
name='circular_' + _name if do_circular else _name,
|
||||
abbr='wikibench-' + _split + '-' + _name +
|
||||
'circular' if do_circular else '',
|
||||
reader_cfg=dict(
|
||||
input_columns=['question'],
|
||||
output_column='answer',
|
||||
),
|
||||
infer_cfg=wikibench_infer_cfg,
|
||||
eval_cfg=wikibench_eval_cfg,
|
||||
))
|
opencompass/configs/models/bailing_api/bailing-lite-0830.py (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
from opencompass.models import BailingAPI
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=False),
|
||||
],
|
||||
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Lite-0830',
|
||||
token='', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
generation_kwargs={
|
||||
'temperature': 0.4,
|
||||
'top_p': 1.0,
|
||||
'top_k': -1,
|
||||
'n': 1,
|
||||
'logprobs': 1,
|
||||
'use_beam_search': False,
|
||||
},
|
||||
),
|
||||
]
|
opencompass/configs/models/bailing_api/bailing-pro-0920.py (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
from opencompass.models import BailingAPI
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=False),
|
||||
],
|
||||
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||
)
|
||||
|
||||
models = [
|
||||
dict(
|
||||
path='Bailing-Pro-0920',
|
||||
token='', # set your key here or in environment variable BAILING_API_KEY
|
||||
url='https://bailingchat.alipay.com/chat/completions',
|
||||
type=BailingAPI,
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
generation_kwargs={
|
||||
'temperature': 0.4,
|
||||
'top_p': 1.0,
|
||||
'top_k': -1,
|
||||
'n': 1,
|
||||
'logprobs': 1,
|
||||
'use_beam_search': False,
|
||||
},
|
||||
),
|
||||
]
|
@ -1,15 +1,24 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='internlm2-chat-7b-turbomind',
|
||||
abbr=f'internlm2-chat-7b-lmdeploy',
|
||||
path='internlm/internlm2-chat-7b',
|
||||
engine_config=dict(session_len=8192, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
|
||||
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
|
||||
# If the model is not supported by 'turbomind', it will fallback to
|
||||
# 'pytorch'
|
||||
backend='turbomind',
|
||||
# For the detailed engine config and generation config, please refer to
|
||||
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
|
||||
engine_config=dict(tp=1),
|
||||
gen_config=dict(do_sample=False),
|
||||
max_seq_len=8192,
|
||||
max_out_len=4096,
|
||||
batch_size=16,
|
||||
# the max number of prompts that LMDeploy receives
|
||||
# in `generate` function
|
||||
batch_size=5000,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
||||
|
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
from opencompass.models import TurboMindModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModel,
|
||||
abbr='qwen2.5-1.5b-turbomind',
|
||||
path='Qwen/Qwen2.5-1.5B',
|
||||
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
|
||||
max_seq_len=7168,
|
||||
max_out_len=1024,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
from opencompass.models import TurboMindModel
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModel,
|
||||
abbr='qwen2.5-7b-turbomind',
|
||||
path='Qwen/Qwen2.5-7B',
|
||||
engine_config=dict(session_len=7168, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
|
||||
max_seq_len=7168,
|
||||
max_out_len=1024,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
)
|
||||
]
|
@ -16,7 +16,7 @@ class GaokaoBenchDataset(BaseDataset):
|
||||
|
||||
@staticmethod
|
||||
def load(path: str, name: str):
|
||||
data = get_data_path(path, local_mode=True)
|
||||
path = get_data_path(path, local_mode=True)
|
||||
if environ.get('DATASET_SOURCE') == 'ModelScope':
|
||||
from modelscope import MsDataset
|
||||
return MsDataset.load(path, subset_name=name, split='test')
|
||||
|
@ -33,6 +33,7 @@ from .crowspairs_cn import * # noqa: F401, F403
|
||||
from .csl import * # noqa: F401, F403
|
||||
from .custom import * # noqa: F401, F403
|
||||
from .cvalues import * # noqa: F401, F403
|
||||
from .dingo import * # noqa: F401, F403
|
||||
from .drcd import * # noqa: F401, F403
|
||||
from .drop import * # noqa: F401, F403
|
||||
from .drop_simple_eval import * # noqa: F401, F403
|
||||
|
opencompass/datasets/dingo.py (new file, 84 lines)
@@ -0,0 +1,84 @@
|
||||
# flake8: noqa
|
||||
# yapf: disable
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
from datasets import Dataset
|
||||
|
||||
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
||||
|
||||
from .base import BaseDataset
|
||||
|
||||
|
||||
@LOAD_DATASET.register_module()
|
||||
class DingoDataset(BaseDataset):
|
||||
|
||||
@staticmethod
|
||||
def load(path: str):
|
||||
raw_data = []
|
||||
with open(path, encoding='utf-8') as f:
|
||||
reader = csv.reader(f, delimiter=';')
|
||||
for row in reader:
|
||||
if len(row) < 1:
|
||||
row = ['']
|
||||
raw_data.append({'input': row[0]})
|
||||
return Dataset.from_list(raw_data)
|
||||
|
||||
|
||||
@LOAD_DATASET.register_module()
|
||||
class DingoLongDataset(BaseDataset):
|
||||
|
||||
@staticmethod
|
||||
def load(path: str):
|
||||
raw_data = []
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
raw_data.append({'input': json.loads(line).get('input')})
|
||||
return Dataset.from_list(raw_data)
|
||||
|
||||
|
||||
@ICL_EVALUATORS.register_module()
|
||||
class DingoEvaluator(BaseEvaluator):
|
||||
|
||||
def score(self, origin_prompt: List, predictions: List) -> dict:
|
||||
try:
|
||||
# from dingo.model.model import Model
|
||||
from dingo.exec import Executor
|
||||
from dingo.io import InputArgs
|
||||
except Exception:
|
||||
raise ModuleNotFoundError(
|
||||
'=========== '
|
||||
'dingo register fail. please try: pip install dingo-python.'
|
||||
' ===========')
|
||||
|
||||
current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
|
||||
file_data = [{'prompt': pmt, 'prediction': prd}
|
||||
for pmt, prd in zip(origin_prompt, predictions)]
|
||||
file_name = 'dingo_file_' + current_time + '.jsonl'
|
||||
with open(file_name, 'a', encoding='utf-8') as f:
|
||||
for d in file_data:
|
||||
json.dump(d, f, ensure_ascii=False)
|
||||
f.write('\n')
|
||||
|
||||
input_data = {
|
||||
'eval_models': ['llm_base'],
|
||||
'input_path': file_name,
|
||||
'output_path': './outputs/dingo/',
|
||||
'dataset': 'local',
|
||||
'datasource': 'local',
|
||||
'data_format': 'jsonl',
|
||||
'column_prompt': ['prompt'],
|
||||
'column_content': ['prediction'],
|
||||
}
|
||||
# Model.apply_config(input_data["custom_config_path"])
|
||||
input_args = InputArgs(**input_data)
|
||||
executor = Executor.exec_map['local'](input_args)
|
||||
result = executor.execute()
|
||||
summary = result[0].to_dict()
|
||||
|
||||
os.remove(file_name)
|
||||
return summary
|
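# Summary of the evaluator flow above: prompt/prediction pairs are dumped to a
# temporary JSONL file, dingo's local Executor scores that file, the first
# result's summary dict is returned as the OpenCompass score, and the temporary
# file is removed afterwards.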
@ -3,6 +3,7 @@ from .ai360_api import AI360GPT # noqa: F401
|
||||
from .alaya import AlayaLM # noqa: F401
|
||||
from .baichuan_api import BaiChuan # noqa: F401
|
||||
from .baidu_api import ERNIEBot # noqa: F401
|
||||
from .bailing_api_oc import BailingAPI # noqa: F401
|
||||
from .base import BaseModel, LMTemplateParser # noqa: F401
|
||||
from .base_api import APITemplateParser, BaseAPIModel # noqa: F401
|
||||
from .bytedance_api import ByteDance # noqa: F401
|
||||
@ -24,8 +25,6 @@ from .interntrain import InternTrain # noqa: F401
|
||||
from .krgpt_api import KrGPT # noqa: F401
|
||||
from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401
|
||||
from .llama2 import Llama2, Llama2Chat # noqa: F401
|
||||
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
|
||||
from .lmdeploy_tis import LmdeployTisModel # noqa: F401
|
||||
from .minimax_api import MiniMax, MiniMaxChatCompletionV2 # noqa: F401
|
||||
from .mistral_api import Mistral # noqa: F401
|
||||
from .mixtral import Mixtral # noqa: F401
|
||||
@ -40,7 +39,6 @@ from .rendu_api import Rendu # noqa: F401
|
||||
from .sensetime_api import SenseTime # noqa: F401
|
||||
from .stepfun_api import StepFun # noqa: F401
|
||||
from .turbomind import TurboMindModel # noqa: F401
|
||||
from .turbomind_tis import TurboMindTisModel # noqa: F401
|
||||
from .turbomind_with_tf_above_v4_33 import \
|
||||
TurboMindModelwithChatTemplate # noqa: F401
|
||||
from .unigpt_api import UniGPT # noqa: F401
|
||||
|
opencompass/models/bailing_api_oc.py (new file, 225 lines)
@@ -0,0 +1,225 @@
|
||||
import concurrent
|
||||
import concurrent.futures
|
||||
import os
|
||||
import socket
|
||||
import traceback
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.connection import HTTPConnection
|
||||
|
||||
try:
|
||||
from retrying import retry
|
||||
except ImportError:
|
||||
retry = None
|
||||
|
||||
from opencompass.utils.prompt import PromptList
|
||||
|
||||
from .base_api import BaseAPIModel
|
||||
|
||||
PromptType = Union[PromptList, str]
|
||||
|
||||
|
||||
class HTTPAdapterWithSocketOptions(HTTPAdapter):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._socket_options = HTTPConnection.default_socket_options + [
|
||||
(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1),
|
||||
(socket.SOL_TCP, socket.TCP_KEEPIDLE, 75),
|
||||
(socket.SOL_TCP, socket.TCP_KEEPINTVL, 30),
|
||||
(socket.SOL_TCP, socket.TCP_KEEPCNT, 120),
|
||||
]
|
||||
super(HTTPAdapterWithSocketOptions, self).__init__(*args, **kwargs)
|
||||
|
||||
def init_poolmanager(self, *args, **kwargs):
|
||||
if self._socket_options is not None:
|
||||
kwargs['socket_options'] = self._socket_options
|
||||
super(HTTPAdapterWithSocketOptions,
|
||||
self).init_poolmanager(*args, **kwargs)
|
||||
|
||||
|
||||
class BailingAPI(BaseAPIModel):
|
||||
"""Model wrapper around Bailing Service.
|
||||
|
||||
Args:
|
||||
        output_key (str): key for prediction
|
||||
query_per_second (int): The maximum queries allowed per second
|
||||
between two consecutive calls of the API. Defaults to 1.
|
||||
generation_kwargs: other params
|
||||
        retry (int): Number of retries if the API call fails. Defaults to 2.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
token: str,
|
||||
url: str,
|
||||
meta_template: Optional[Dict] = None,
|
||||
query_per_second: int = 1,
|
||||
retry: int = 3,
|
||||
generation_kwargs: Dict = {},
|
||||
max_seq_len=4096,
|
||||
):
|
||||
super().__init__(
|
||||
path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
query_per_second=query_per_second,
|
||||
meta_template=meta_template,
|
||||
retry=retry,
|
||||
generation_kwargs=generation_kwargs,
|
||||
)
|
||||
|
||||
self.logger.info(f'Bailing API Model Init path: {path} url={url}')
|
||||
if not token:
|
||||
token = os.environ.get('BAILING_API_KEY')
|
||||
if token:
|
||||
self._headers = {'Authorization': f'Bearer {token}'}
|
||||
else:
|
||||
                raise RuntimeError('There is no valid token.')
|
||||
else:
|
||||
self._headers = {'Authorization': f'Bearer {token}'}
|
||||
|
||||
self._headers['Content-Type'] = 'application/json'
|
||||
self._url = url if url else \
|
||||
'https://bailingchat.alipay.com/chat/completions'
|
||||
self._model = path
|
||||
self._sessions = []
|
||||
self._num = (int(os.environ.get('BAILING_API_PARALLEL_NUM'))
|
||||
if os.environ.get('BAILING_API_PARALLEL_NUM') else 1)
|
||||
try:
|
||||
for _ in range(self._num):
|
||||
adapter = HTTPAdapterWithSocketOptions()
|
||||
sess = requests.Session()
|
||||
sess.mount('http://', adapter)
|
||||
sess.mount('https://', adapter)
|
||||
self._sessions.append(sess)
|
||||
except Exception as e:
|
||||
self.logger.error(f'Fail to setup the session. {e}')
|
||||
raise e
|
||||
|
||||
def generate(
|
||||
self,
|
||||
inputs: Union[List[str], PromptList],
|
||||
max_out_len: int = 4096,
|
||||
) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
inputs (Union[List[str], PromptList]):
|
||||
A list of strings or PromptDicts.
|
||||
The PromptDict should be organized in OpenCompass' API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
with concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=self._num, ) as executor:
|
||||
future_to_m = {
|
||||
executor.submit(
|
||||
self._generate,
|
||||
self._sessions[i % self._num],
|
||||
input,
|
||||
max_out_len,
|
||||
): i
|
||||
for i, input in enumerate(inputs)
|
||||
}
|
||||
results = []
|
||||
for future in concurrent.futures.as_completed(future_to_m):
|
||||
m = future_to_m[future] # noqa F841
|
||||
resp = future.result()
|
||||
if resp and resp.status_code == 200:
|
||||
try:
|
||||
result = resp.json()
|
||||
except Exception as e: # noqa F841
|
||||
results.append('')
|
||||
else:
|
||||
if (result.get('choices')
|
||||
and result['choices'][0].get('message')
|
||||
and result['choices'][0]['message'].get(
|
||||
'content')):
|
||||
results.append(
|
||||
result['choices'][0]['message']['content'])
|
||||
else:
|
||||
results.append('')
|
||||
self.flush()
|
||||
return results
|
||||
|
||||
def _generate(
|
||||
self,
|
||||
sess,
|
||||
input: Union[str, PromptList],
|
||||
max_out_len: int,
|
||||
) -> str:
|
||||
"""Generate results given an input.
|
||||
|
||||
Args:
|
||||
inputs (str or PromptList): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass' API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
if isinstance(input, str):
|
||||
messages = [{'role': 'user', 'content': input}]
|
||||
else:
|
||||
messages = []
|
||||
for item in input:
|
||||
content = item['prompt']
|
||||
if not content:
|
||||
continue
|
||||
message = {'content': content}
|
||||
if item['role'] == 'HUMAN':
|
||||
message['role'] = 'user'
|
||||
elif item['role'] == 'BOT':
|
||||
message['role'] = 'assistant'
|
||||
elif item['role'] == 'SYSTEM':
|
||||
message['role'] = 'system'
|
||||
else:
|
||||
message['role'] = item['role']
|
||||
messages.append(message)
|
||||
request = {
|
||||
'model':
|
||||
self._model,
|
||||
'messages':
|
||||
messages,
|
||||
'max_seq_len':
|
||||
max(
|
||||
max_out_len if max_out_len else 4096,
|
||||
self.max_seq_len if self.max_seq_len else 4096,
|
||||
),
|
||||
}
|
||||
request.update(self.generation_kwargs)
|
||||
try:
|
||||
retry_num = 0
|
||||
while retry_num < self.retry:
|
||||
response = self._infer_result(request, sess)
|
||||
if response.status_code == 200:
|
||||
break # success
|
||||
elif response.status_code == 426:
|
||||
retry_num += 1 # retry
|
||||
else:
|
||||
raise ValueError(f'Status code = {response.status_code}')
|
||||
else:
|
||||
raise ValueError(
|
||||
f'Exceed the maximal retry times. Last status code '
|
||||
f'= {response.status_code}')
|
||||
except Exception as e:
|
||||
            self.logger.error(f'Failed to infer request={request}; '
|
||||
f'model_name={self.path}; error={e}, '
|
||||
f'stack:{traceback.format_exc()}')
|
||||
raise e
|
||||
return response
|
||||
|
||||
# @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms
|
||||
def _infer_result(self, request, sess):
|
||||
response = sess.request(
|
||||
'POST',
|
||||
self._url,
|
||||
json=request,
|
||||
headers=self._headers,
|
||||
timeout=500,
|
||||
)
|
||||
return response
|
@ -79,6 +79,50 @@ class LegacyInternTrainManager(InternTrainManager):
|
||||
|
||||
@MODELS.register_module()
|
||||
class InternTrain(BaseModel):
|
||||
"""Model wrapper for InternTrain.
|
||||
|
||||
Args:
|
||||
path (str): The name or path to HuggingFace's model.
|
||||
module_path (str): Path of InternTrain repository.
|
||||
max_seq_len (int): The maximum length of the input sequence. Defaults
|
||||
to 2048.
|
||||
tokenizer_only (bool): If True, only the tokenizer will be initialized.
|
||||
Defaults to False.
|
||||
tokenizer_path (str): The path to the tokenizer. Defaults to None.
|
||||
tokenizer_type: InternTrain's tokenizer type. Defaults to 'InternLM'.
|
||||
model_config (str, dict, optional): Config of model. There are several
|
||||
options for this parameter:
|
||||
|
||||
- filename (str): The config items are defined in a python file
|
||||
so the model will load configs from this file.
|
||||
- config (dict): The configuration items are defined in a dict
|
||||
and the model will be initialized from ```model_config```.
|
||||
- None: The config is loaded from ```path```. In this case,
|
||||
please make sure that ```path``` contains a config file named
|
||||
``model_config.pt``.
|
||||
|
||||
Defaults to None.
|
||||
model_type: Type of model. Defaults to 'InternTrain'
|
||||
ckpt_type: The type of load function in InternTrain when checkpoints
|
||||
are loaded. Defaults to None, which means load the checkpoint
|
||||
            directly with pipeline merged.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
model_dtype: The model's dtype. If None, will use dtype defined in
|
||||
```model_config```. Defaults to None.
|
||||
generation_kwargs (Dict, optional): The generation kwargs for the
|
||||
model. Defaults to dict().
|
||||
sync_rank (bool): Whether to sync inputs between ranks. Do not use this
|
||||
if you are not familiar with this behavior. Check `sync_inputs`
|
||||
function for more details. Defaults to False.
|
||||
mode (str, optional): The method of input truncation when input length
|
||||
exceeds max_seq_len. 'mid' represents the part of input to
|
||||
truncate. Defaults to 'none'.
|
||||
end_str (str, optional): Whether to trim generated strings with end_str
|
||||
if the model has special ending strings that are not handled well.
|
||||
Defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
path: str,
|
||||
@ -87,14 +131,15 @@ class InternTrain(BaseModel):
|
||||
tokenizer_only: bool = False,
|
||||
tokenizer_path: Optional[str] = None,
|
||||
tokenizer_type: str = 'INTERNLM',
|
||||
model_config: Optional[str] = None,
|
||||
model_config: Optional[Union[str, Dict]] = None,
|
||||
model_type: str = 'INTERNLM2',
|
||||
ckpt_type: Optional[str] = None,
|
||||
meta_template: Optional[Dict] = None,
|
||||
model_dtype: Optional[str] = None,
|
||||
generation_kwargs={},
|
||||
sync_rank: bool = False,
|
||||
mode='none'):
|
||||
mode='none',
|
||||
end_str: Optional[str] = None):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
tokenizer_only=tokenizer_only,
|
||||
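With this change, model_config can be passed either as a path to a config file or as an in-memory dict. A hedged instantiation sketch (the paths and config values below are invented placeholders, not a configuration from the repository; only the parameter names come from the signature and docstring above):

# Hedged sketch: values are placeholders, not an official config.
model = InternTrain(
    path='path/to/ckpt_dir',
    tokenizer_path='path/to/tokenizer.model',
    tokenizer_type='INTERNLM',
    model_config=dict(hidden_size=4096, num_layers=32),  # dict form now accepted
    model_type='INTERNLM2',
    max_seq_len=2048,
    end_str='<eoa>',
)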
@ -146,6 +191,7 @@ class InternTrain(BaseModel):
|
||||
bos_token_id=self.tokenizer.bos_id,
|
||||
pad_token_id=self.tokenizer.bos_id,
|
||||
eos_token_id=eos_token_ids)
|
||||
self.end_str = end_str
|
||||
|
||||
def _load_model(self,
|
||||
path: str,
|
||||
@ -242,7 +288,7 @@ class InternTrain(BaseModel):
|
||||
else:
|
||||
raise NotImplementedError(f'Unknown model dtype {model_dtype}')
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
def get_token_len(self, prompt: str, use_bos=None, use_eos=None) -> int:
|
||||
"""Get lengths of the tokenized strings.
|
||||
|
||||
Args:
|
||||
@ -251,7 +297,7 @@ class InternTrain(BaseModel):
|
||||
Returns:
|
||||
int: Length of the input tokens
|
||||
"""
|
||||
tokens = self.tokenizer(prompt, use_bos=True, use_eos=True)
|
||||
tokens = self.tokenizer(prompt, use_bos=use_bos, use_eos=use_eos)
|
||||
return len(tokens)
|
||||
|
||||
def generate(self,
|
||||
@ -287,8 +333,10 @@ class InternTrain(BaseModel):
|
||||
max_length=tokens.shape[1] + max_out_len,
|
||||
**self.generation_kwargs) # bsz, num_return_sequences, max_length
|
||||
outputs = outputs[:, 0, tokens.shape[1]:]
|
||||
output_text = self.batch_decode(outputs,
|
||||
stopping_criteria=stopping_criteria)
|
||||
output_text = self.batch_decode(
|
||||
outputs,
|
||||
eos_token_ids=self.generator.eos_token_id,
|
||||
stopping_criteria=stopping_criteria)
|
||||
|
||||
return output_text
|
||||
|
||||
@ -343,7 +391,7 @@ class InternTrain(BaseModel):
|
||||
for input_text, cont in zip(input_texts, conts)
|
||||
]
|
||||
replaced_lens = [
|
||||
len(self.encode(input_text)[0]) for input_text in replaced_texts
|
||||
self.get_token_len(input_text) for input_text in replaced_texts
|
||||
]
|
||||
loglikelihoods = []
|
||||
for nloss, nlen, rlen in zip(loss, lens, replaced_lens):
|
||||
@ -407,11 +455,22 @@ class InternTrain(BaseModel):
|
||||
|
||||
return torch.LongTensor(tokens).cuda()
|
||||
|
||||
def batch_decode(self, outputs, stopping_criteria: List[str] = []):
|
||||
def batch_decode(self,
|
||||
outputs,
|
||||
eos_token_ids: List[int],
|
||||
stopping_criteria: List[str] = []):
|
||||
# outputs: bsz, seq_len
|
||||
output_text = []
|
||||
outputs = outputs.tolist()
|
||||
for output in outputs:
|
||||
text = self.tokenizer.decode(output.tolist())
|
||||
# cut off by eos_token_ids
|
||||
eos_idx = len(output)
|
||||
for eos_id in eos_token_ids:
|
||||
if eos_id in output:
|
||||
eos_idx = min(output.index(eos_id), eos_idx)
|
||||
text = self.tokenizer.decode(output[:eos_idx])
|
||||
if self.end_str is not None:
|
||||
text = text.split(self.end_str)[0]
|
||||
for stop_word in stopping_criteria:
|
||||
text = text.split(stop_word)[0]
|
||||
output_text.append(text)
|
||||
|
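The reworked batch_decode above truncates each sequence at the earliest eos token before decoding, and only then trims end_str and the stop words. A minimal standalone illustration of that cutoff step (the token ids below are invented for the example):

# Illustration of the eos cutoff used in batch_decode; the ids are made up.
def cut_at_eos(output, eos_token_ids):
    eos_idx = len(output)
    for eos_id in eos_token_ids:
        if eos_id in output:
            eos_idx = min(output.index(eos_id), eos_idx)
    return output[:eos_idx]

assert cut_at_eos([5, 9, 2, 7], eos_token_ids=[2]) == [5, 9]    # cut before eos id 2
assert cut_at_eos([5, 9, 7], eos_token_ids=[2]) == [5, 9, 7]    # no eos: keep everything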
@ -1,188 +0,0 @@
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from opencompass.models.base import BaseModel
|
||||
from opencompass.utils.logging import get_logger
|
||||
from opencompass.utils.prompt import PromptList
|
||||
|
||||
PromptType = Union[PromptList, str]
|
||||
|
||||
|
||||
def valid_str(string, coding='utf-8'):
|
||||
"""decode text according to its encoding type."""
|
||||
invalid_chars = [b'\xef\xbf\xbd']
|
||||
bstr = bytes(string, coding)
|
||||
for invalid_char in invalid_chars:
|
||||
bstr = bstr.replace(invalid_char, b'')
|
||||
ret = bstr.decode(encoding=coding, errors='ignore')
|
||||
return ret
|
||||
|
||||
|
||||
class LmdeployPytorchModel(BaseModel):
|
||||
"""Model wrapper for lmdeploy pytorch engine through python API.
|
||||
|
||||
Args:
|
||||
path (str): path of the supported pytorch model.
|
||||
max_seq_len (int): The maximum allowed sequence length of a model.
|
||||
Note that the length of prompt + generated tokens shall not exceed
|
||||
this value. Defaults to 2048.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
engine_config (Dict, optional): The engine config to set
|
||||
arguments like session_len, max_batch_size for TurboMind.
|
||||
gen_config (Dict, optional): Generation config to set
|
||||
arguments like top_k, top_p, temperature.
|
||||
end_str (str, optional): Whether to trim generated strings with end_str
|
||||
if the model has special ending strings that are not handled well.
|
||||
Defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
path: str,
|
||||
concurrency: int = 8,
|
||||
max_seq_len: int = 2048,
|
||||
meta_template: Optional[Dict] = None,
|
||||
engine_config: Optional[Dict] = None,
|
||||
gen_config: Optional[Dict] = None,
|
||||
end_str: Optional[str] = None):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
meta_template=meta_template)
|
||||
from lmdeploy.pytorch import engine as tm
|
||||
from lmdeploy.version import version_info
|
||||
|
||||
if engine_config is not None:
|
||||
from lmdeploy.messages import PytorchEngineConfig
|
||||
engine_config = PytorchEngineConfig(**engine_config)
|
||||
# set thread_safe
|
||||
if hasattr(engine_config, 'thread_safe'):
|
||||
engine_config.thread_safe = True
|
||||
|
||||
if gen_config is not None:
|
||||
from lmdeploy.messages import GenerationConfig
|
||||
gen_config = GenerationConfig(**gen_config)
|
||||
|
||||
self.logger = get_logger()
|
||||
tm_model = tm.Engine(path, engine_config)
|
||||
self.tokenizer = tm_model.tokenizer
|
||||
self.generators = [
|
||||
tm_model.create_instance() for i in range(concurrency)
|
||||
]
|
||||
self.generator_ids = [i + 1 for i in range(concurrency)]
|
||||
|
||||
from transformers import GenerationConfig
|
||||
try:
|
||||
generation_config = GenerationConfig.from_pretrained(path)
|
||||
except Exception:
|
||||
generation_config = None
|
||||
if generation_config and hasattr(generation_config, 'eos_token_id'):
|
||||
if gen_config.stop_words is None:
|
||||
stop_words = []
|
||||
if isinstance(generation_config.eos_token_id, int):
|
||||
stop_words.append(generation_config.eos_token_id)
|
||||
else:
|
||||
assert isinstance(generation_config.eos_token_id, list)
|
||||
for token_id in generation_config.eos_token_id:
|
||||
stop_words.append(token_id)
|
||||
gen_config.stop_words = stop_words
|
||||
if version_info >= (0, 6, 0):
|
||||
gen_config.stop_token_ids = stop_words
|
||||
self.gen_config = gen_config
|
||||
self.end_str = end_str
|
||||
self.major_version, self.minor_version = version_info[:2]
|
||||
|
||||
def generate(
|
||||
self,
|
||||
inputs: List[str],
|
||||
max_out_len: int = 512,
|
||||
) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
inputs (List[str]): A list of prompts
|
||||
max_out_len (int): The maximum length of the output.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
assert isinstance(
|
||||
inputs, List), f'List(str) is expected, but got {type(inputs)}'
|
||||
|
||||
# split inputs into batches
|
||||
batch_size = len(self.generators)
|
||||
batch_inputs = [
|
||||
inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)
|
||||
]
|
||||
|
||||
results = []
|
||||
for batch_input in batch_inputs:
|
||||
with ThreadPoolExecutor() as executor:
|
||||
_results = list(
|
||||
executor.map(
|
||||
self._generate,
|
||||
self.generators[:len(batch_input)],
|
||||
self.generator_ids[:len(batch_input)],
|
||||
batch_input,
|
||||
[self.gen_config] * len(batch_input),
|
||||
[self.end_str] * len(batch_input),
|
||||
))
|
||||
results += _results
|
||||
return results
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
input_ids = self.tokenizer.encode(prompt)
|
||||
return len(input_ids)
|
||||
|
||||
def wait(self):
|
||||
"""Wait till the next query can be sent.
|
||||
|
||||
Applicable in both single-thread and multi-thread environments.
|
||||
"""
|
||||
return self.token_bucket.get_token()
|
||||
|
||||
def _generate(self,
|
||||
generator,
|
||||
session_id,
|
||||
prompt: PromptType,
|
||||
gen_config=None,
|
||||
end_str: Optional[str] = None) -> str:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
prompt (PromptType): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
gen_config (GenerationConfig, optional): Generation
|
||||
config to set arguments like top_k, top_p, temperature.
|
||||
end_str (str, optional): Whether to trim generated strings
|
||||
with end_str if the model has special ending strings
|
||||
that are not handled well.
|
||||
Defaults to None.
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert type(
|
||||
prompt) is str, 'We only support string for TurboMind Python API'
|
||||
input_ids = self.tokenizer.encode(prompt)
|
||||
if self.major_version >= 0 and self.minor_version >= 4:
|
||||
outputs = generator.infer(session_id,
|
||||
input_ids,
|
||||
gen_config=gen_config)
|
||||
output_ids = outputs.token_ids
|
||||
else:
|
||||
_, output_ids, _ = generator.infer(session_id,
|
||||
input_ids,
|
||||
gen_config=gen_config)
|
||||
|
||||
# stop engine
|
||||
if hasattr(generator, 'end'):
|
||||
generator.end(session_id)
|
||||
# decode output
|
||||
response_all = self.tokenizer.decode(output_ids)
|
||||
# trim output
|
||||
if end_str:
|
||||
response_all = response_all.split(end_str)[0]
|
||||
# remove invalid characters
|
||||
response_all = valid_str(response_all)
|
||||
return response_all
|
@ -1,200 +0,0 @@
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
from queue import Queue
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from opencompass.models.base import BaseModel, LMTemplateParser
|
||||
from opencompass.utils.logging import get_logger
|
||||
from opencompass.utils.prompt import PromptList
|
||||
|
||||
PromptType = Union[PromptList, str]
|
||||
|
||||
|
||||
def valid_str(string, coding='utf-8'):
|
||||
"""decode text according to its encoding type."""
|
||||
invalid_chars = [b'\xef\xbf\xbd']
|
||||
bstr = bytes(string, coding)
|
||||
for invalid_char in invalid_chars:
|
||||
bstr = bstr.replace(invalid_char, b'')
|
||||
ret = bstr.decode(encoding=coding, errors='ignore')
|
||||
return ret
|
||||
|
||||
|
||||
def prepare_tensor(name, input_tensor):
|
||||
"""Create grpcclient's InferInput instance according to a given tensor."""
|
||||
import tritonclient.grpc as grpcclient
|
||||
from tritonclient.utils import np_to_triton_dtype
|
||||
t = grpcclient.InferInput(name, list(input_tensor.shape),
|
||||
np_to_triton_dtype(input_tensor.dtype))
|
||||
t.set_data_from_numpy(input_tensor)
|
||||
return t
|
||||
|
||||
|
||||
def stream_callback(que, result, error):
|
||||
"""callback function invoked by triton client."""
|
||||
que.put((result, error))
|
||||
|
||||
|
||||
class LmdeployTisModel(BaseModel):
|
||||
"""Model wrapper for LMDeploy Python Backend Triton Inference Server gRPC
|
||||
API.
|
||||
|
||||
Args:
|
||||
path (str): The name of OpenAI's model.
|
||||
tis_addr (str): The address (ip:port format) of turbomind's
|
||||
triton inference server
|
||||
max_seq_len (int): The maximum allowed sequence length of a model.
|
||||
Note that the length of prompt + generated tokens shall not exceed
|
||||
this value. Defaults to 2048.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
"""
|
||||
|
||||
is_api: bool = True
|
||||
|
||||
def __init__(self,
|
||||
path: str,
|
||||
tis_addr: str = '0.0.0.0:33337',
|
||||
max_seq_len: int = 2048,
|
||||
meta_template: Optional[Dict] = None,
|
||||
end_str: Optional[str] = None):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
meta_template=meta_template)
|
||||
from lmdeploy.tokenizer import Tokenizer
|
||||
|
||||
self.logger = get_logger()
|
||||
self.template_parser = LMTemplateParser(meta_template)
|
||||
self.eos_token_id = None
|
||||
if meta_template and 'eos_token_id' in meta_template:
|
||||
self.eos_token_id = meta_template['eos_token_id']
|
||||
self.tis_addr = tis_addr
|
||||
self.tokenizer = Tokenizer(path)
|
||||
self.end_str = end_str
|
||||
|
||||
def generate(
|
||||
self,
|
||||
inputs: List[str or PromptList],
|
||||
max_out_len: int = 512,
|
||||
temperature: float = 1.0,
|
||||
) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
inputs (List[str or PromptList]): A list of strings or PromptDicts.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
temperature (float): What sampling temperature to use,
|
||||
between 0 and 2. Higher values like 0.8 will make the output
|
||||
more random, while lower values like 0.2 will make it more
|
||||
focused and deterministic. Defaults to 1.0.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
results = list(
|
||||
executor.map(self._generate, inputs,
|
||||
[max_out_len] * len(inputs),
|
||||
[temperature] * len(inputs),
|
||||
[self.end_str] * len(inputs)))
|
||||
return results
|
||||
|
||||
def wait(self):
|
||||
"""Wait till the next query can be sent.
|
||||
|
||||
Applicable in both single-thread and multi-thread environments.
|
||||
"""
|
||||
return self.token_bucket.get_token()
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
input_ids = self.tokenizer.encode(prompt)
|
||||
return len(input_ids)
|
||||
|
||||
def _call_triton_server(self, prompt, tis_addr, session_id,
|
||||
request_output_len, temperature, res_que):
|
||||
import tritonclient.grpc as grpcclient
|
||||
|
||||
with grpcclient.InferenceServerClient(tis_addr) as client:
|
||||
inputs = [
|
||||
prepare_tensor('prompt',
|
||||
np.array([prompt.encode()], dtype=np.object_)),
|
||||
prepare_tensor('max_tokens',
|
||||
np.array([request_output_len], dtype=np.int32)),
|
||||
prepare_tensor('temperature',
|
||||
np.array([temperature], dtype=np.float_)),
|
||||
prepare_tensor('top_p', np.array([1.0], dtype=np.float_)),
|
||||
prepare_tensor('top_k', np.array([1], dtype=np.int32)),
|
||||
prepare_tensor('ignore_eos', np.array([False],
|
||||
dtype=np.bool_)),
|
||||
prepare_tensor('stream', np.array([True], dtype=np.bool_)),
|
||||
]
|
||||
|
||||
# async_stream
|
||||
client.start_stream(partial(stream_callback, res_que))
|
||||
client.async_stream_infer('lmdeploy_model',
|
||||
inputs,
|
||||
sequence_id=session_id,
|
||||
sequence_start=True,
|
||||
sequence_end=True)
|
||||
|
||||
res_que.put(None)
|
||||
return
|
||||
|
||||
def _process_result(self, que):
|
||||
text = ''
|
||||
while True:
|
||||
res = que.get()
|
||||
if res is not None:
|
||||
result, err = res
|
||||
if err is not None:
|
||||
print(err)
|
||||
else:
|
||||
res = result.as_numpy('response').item().decode()
|
||||
text += res
|
||||
else:
|
||||
return text
|
||||
|
||||
def _generate(self,
|
||||
prompt: str or PromptList,
|
||||
max_out_len: int,
|
||||
temperature: float,
|
||||
end_str: Optional[str] = None) -> str:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
prompt (str or PromptList): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
temperature (float): What sampling temperature to use,
|
||||
between 0 and 2. Higher values like 0.8 will make the output
|
||||
more random, while lower values like 0.2 will make it more
|
||||
focused and deterministic.
|
||||
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert type(
|
||||
prompt
|
||||
) is str, 'We only support string for LMDeploy Python Backend TIS API'
|
||||
|
||||
res_que = Queue()
|
||||
|
||||
self._call_triton_server(prompt=prompt,
|
||||
tis_addr=self.tis_addr,
|
||||
session_id=threading.currentThread().ident,
|
||||
request_output_len=max_out_len,
|
||||
temperature=temperature,
|
||||
res_que=res_que)
|
||||
text = self._process_result(res_que)
|
||||
response = valid_str(text)
|
||||
if end_str:
|
||||
response = response.split(end_str)[0]
|
||||
return response
|
@ -601,6 +601,10 @@ class OpenAISDK(OpenAI):
if self.verbose:
self.logger.info(
'Successfully get response from OpenAI API')
try:
self.logger.info(responses)
except Exception as e:  # noqa F841
pass
return responses.choices[0].message.content
except Exception as e:
self.logger.error(e)
||||
|
@ -1,135 +0,0 @@
|
||||
import logging
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from opencompass.models.base import BaseModel, LMTemplateParser
|
||||
from opencompass.utils.logging import get_logger
|
||||
from opencompass.utils.prompt import PromptList
|
||||
|
||||
PromptType = Union[PromptList, str]
|
||||
|
||||
|
||||
def valid_str(string, coding='utf-8'):
|
||||
"""decode text according to its encoding type."""
|
||||
invalid_chars = [b'\xef\xbf\xbd']
|
||||
bstr = bytes(string, coding)
|
||||
for invalid_char in invalid_chars:
|
||||
bstr = bstr.replace(invalid_char, b'')
|
||||
ret = bstr.decode(encoding=coding, errors='ignore')
|
||||
return ret
|
||||
|
||||
|
||||
class TurboMindTisModel(BaseModel):
|
||||
"""Model wrapper for TurboMind Triton Inference Server gRPC API.
|
||||
|
||||
Args:
|
||||
path (str): The name of OpenAI's model.
|
||||
tis_addr (str): The address (ip:port format) of turbomind's
|
||||
triton inference server
|
||||
max_seq_len (int): The maximum allowed sequence length of a model.
|
||||
Note that the length of prompt + generated tokens shall not exceed
|
||||
this value. Defaults to 2048.
|
||||
meta_template (Dict, optional): The model's meta prompt
|
||||
template if needed, in case the requirement of injecting or
|
||||
wrapping of any meta instructions.
|
||||
"""
|
||||
|
||||
is_api: bool = True
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
tis_addr: str = '0.0.0.0:33337',
|
||||
max_seq_len: int = 2048,
|
||||
meta_template: Optional[Dict] = None,
|
||||
):
|
||||
super().__init__(path=path,
|
||||
max_seq_len=max_seq_len,
|
||||
meta_template=meta_template)
|
||||
from lmdeploy.serve.turbomind.utils import Preprocessor
|
||||
self.preprocess = Preprocessor(tis_addr)
|
||||
self.logger = get_logger()
|
||||
self.template_parser = LMTemplateParser(meta_template)
|
||||
self.eos_token_id = None
|
||||
if meta_template and 'eos_token_id' in meta_template:
|
||||
self.eos_token_id = meta_template['eos_token_id']
|
||||
self.tis_addr = tis_addr
|
||||
|
||||
def generate(
|
||||
self,
|
||||
inputs: List[PromptType],
|
||||
max_out_len: int = 512,
|
||||
temperature: float = 1.0,
|
||||
) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
inputs (List[PromptType]): A list of strings or PromptDicts.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
temperature (float): What sampling temperature to use,
|
||||
between 0 and 2. Higher values like 0.8 will make the output
|
||||
more random, while lower values like 0.2 will make it more
|
||||
focused and deterministic. Defaults to 1.0.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
results = list(
|
||||
executor.map(self._generate, inputs,
|
||||
[max_out_len] * len(inputs),
|
||||
[temperature] * len(inputs)))
|
||||
return results
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
input_ids, _ = self.preprocess(prompt)
|
||||
return input_ids.shape[-1]
|
||||
|
||||
def wait(self):
|
||||
"""Wait till the next query can be sent.
|
||||
|
||||
Applicable in both single-thread and multi-thread environments.
|
||||
"""
|
||||
return self.token_bucket.get_token()
|
||||
|
||||
def _generate(self, prompt: PromptType, max_out_len: int,
|
||||
temperature: float) -> str:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
prompt (PromptType): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
max_out_len (int): The maximum length of the output.
|
||||
temperature (float): What sampling temperature to use,
|
||||
between 0 and 2. Higher values like 0.8 will make the output
|
||||
more random, while lower values like 0.2 will make it more
|
||||
focused and deterministic.
|
||||
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert type(
|
||||
prompt) is str, 'We only support string for TurboMind RPC API'
|
||||
|
||||
from lmdeploy.serve.turbomind.chatbot import Chatbot
|
||||
chatbot = Chatbot(self.tis_addr,
|
||||
temperature=temperature,
|
||||
capability='completion',
|
||||
top_k=1,
|
||||
log_level=logging.ERROR)
|
||||
|
||||
for status, text, n_token in chatbot.stream_infer(
|
||||
session_id=threading.currentThread().ident,
|
||||
prompt=prompt,
|
||||
request_output_len=max_out_len,
|
||||
sequence_start=True,
|
||||
sequence_end=True):
|
||||
continue
|
||||
response = valid_str(text)
|
||||
response = response.replace('<eoa>', '')
|
||||
return response
|
@ -1,7 +1,6 @@
|
||||
# flake8: noqa
|
||||
# yapf: disable
|
||||
import copy
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from opencompass.models.base import BaseModel
|
||||
@ -31,38 +30,32 @@ class TurboMindModelwithChatTemplate(BaseModel):
|
||||
self,
|
||||
path: str,
|
||||
tokenizer_only: bool = False,
|
||||
backend: str = 'turbomind',
|
||||
engine_config: Dict = {},
|
||||
gen_config: Dict = {},
|
||||
concurrency: int = 8,
|
||||
max_seq_len: int = None,
|
||||
meta_template: Optional[Dict] = None,
|
||||
fastchat_template: Optional[str] = None,
|
||||
stop_words: List[str] = [],
|
||||
):
|
||||
from lmdeploy.messages import TurbomindEngineConfig
|
||||
from lmdeploy.turbomind import TurboMind
|
||||
from lmdeploy.version import version_info
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
self.logger = get_logger()
|
||||
self.path = path
|
||||
self.tokenizer_only = tokenizer_only
|
||||
self.template_parser = _get_meta_template(meta_template)
|
||||
self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
|
||||
|
||||
self.origin_tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
|
||||
from lmdeploy import version_info
|
||||
from transformers import AutoTokenizer
|
||||
self.version_info = version_info
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
|
||||
if not tokenizer_only:
|
||||
DEFAULT_ENGING_CONFIG = {'session_len': self.max_seq_len}
|
||||
_engine_config = DEFAULT_ENGING_CONFIG.copy()
|
||||
_engine_config.update(engine_config)
|
||||
engine_config = TurbomindEngineConfig(**_engine_config)
|
||||
tm_model = TurboMind.from_pretrained(path, engine_config=engine_config)
|
||||
self.tokenizer = tm_model.tokenizer
|
||||
self.generators = [tm_model.create_instance() for i in range(concurrency)]
|
||||
self.generator_ids = [i + 1 for i in range(concurrency)]
|
||||
self.concurrency = concurrency
|
||||
self.pipe = self._build_pipe(path, backend, _engine_config)
|
||||
else:
|
||||
self.pipe = None
|
||||
self.gen_config = gen_config
|
||||
self.version_info = version_info
|
||||
self.fastchat_template = fastchat_template
|
||||
self.stop_words = list(set(stop_words + self._get_potential_stop_words(path)))
|
||||
self.logger.info(f'using stop words: {self.stop_words}')
|
||||
@ -76,23 +69,23 @@ class TurboMindModelwithChatTemplate(BaseModel):
|
||||
generation_config = None
|
||||
if generation_config and hasattr(generation_config, 'eos_token_id'):
|
||||
if isinstance(generation_config.eos_token_id, int):
|
||||
potential_stop_words.append(self.origin_tokenizer.decode(generation_config.eos_token_id))
|
||||
potential_stop_words.append(self.tokenizer.decode(generation_config.eos_token_id))
|
||||
else:
|
||||
assert isinstance(generation_config.eos_token_id, list)
|
||||
for token_id in generation_config.eos_token_id:
|
||||
potential_stop_words.append(self.origin_tokenizer.decode(token_id))
|
||||
if self.origin_tokenizer.eos_token is not None:
|
||||
potential_stop_words.append(self.origin_tokenizer.eos_token)
|
||||
potential_stop_words.append(self.tokenizer.decode(token_id))
|
||||
if self.tokenizer.eos_token is not None:
|
||||
potential_stop_words.append(self.tokenizer.eos_token)
|
||||
potential_stop_words = list(set(potential_stop_words))
|
||||
potential_stop_words = [s for s in potential_stop_words if s]
|
||||
return potential_stop_words
|
||||
|
||||
def generate(self,
|
||||
inputs: List[str],
|
||||
max_out_len: int = 512,
|
||||
max_out_len: int,
|
||||
stopping_criteria: List[str] = [],
|
||||
do_sample: Optional[bool] = None,
|
||||
temperature: int = 1,
|
||||
temperature: float = 1.0,
|
||||
**kwargs) -> List[str]:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
@ -104,93 +97,45 @@ class TurboMindModelwithChatTemplate(BaseModel):
|
||||
List[str]: A list of generated strings.
|
||||
"""
|
||||
assert isinstance(inputs, List), f'List(str) is expected, but got {type(inputs)}'
|
||||
|
||||
messages = _convert_chat_messages(inputs)
|
||||
if self.fastchat_template:
|
||||
messages = _format_with_fast_chat_template(messages, self.fastchat_template)
|
||||
else:
|
||||
messages = [self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages]
|
||||
|
||||
# split messages into batches
|
||||
batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)]
|
||||
messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages]
|
||||
|
||||
stop_words = list(set(self.stop_words + stopping_criteria))
|
||||
encode_stop_words = []
|
||||
if stop_words is not None and len(stop_words) > 0:
|
||||
for words in stop_words:
|
||||
encode_stop_words += self.tokenizer.encode(words, add_bos=False)
|
||||
|
||||
DEFAULT_GEN_CONFIG = {
|
||||
'max_new_tokens': max_out_len,
|
||||
'min_new_tokens': 1,
|
||||
'top_k': 1,
|
||||
'stop_words': encode_stop_words,
|
||||
'stop_words': stop_words,
|
||||
}
|
||||
|
||||
gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
|
||||
gen_config.update(self.gen_config)
|
||||
if do_sample:
|
||||
gen_config['top_k'] = 1000
|
||||
gen_config['top_k'] = 40
|
||||
gen_config['temperature'] = temperature
|
||||
else:
|
||||
if self.version_info >= (0, 6, 0):
|
||||
gen_config['do_sample'] = False
|
||||
else:
|
||||
gen_config['top_k'] = 1
|
||||
|
||||
from lmdeploy.messages import GenerationConfig
|
||||
from lmdeploy import GenerationConfig
|
||||
gen_config = {k: v for k, v in gen_config.items() if hasattr(GenerationConfig, k)}
|
||||
gen_config = GenerationConfig(**gen_config)
|
||||
if self.version_info >= (0, 6, 0):
|
||||
gen_config.stop_words = stop_words
|
||||
gen_config.convert_stop_bad_words_to_ids(self.tokenizer)
|
||||
|
||||
results = []
|
||||
for batch_message in batch_messages:
|
||||
n = len(batch_message)
|
||||
with ThreadPoolExecutor() as executor:
|
||||
_results = list(
|
||||
executor.map(
|
||||
self._generate,
|
||||
self.generators[:n],
|
||||
self.generator_ids[:n],
|
||||
batch_message,
|
||||
[gen_config] * n,
|
||||
))
|
||||
results += _results
|
||||
outputs = self.pipe(messages, gen_config=gen_config, do_preprocess=False)
|
||||
for output in outputs:
|
||||
text = self.tokenizer.decode(output.token_ids)
|
||||
results.append(text)
|
||||
|
||||
for s in stop_words:
|
||||
results = [r.split(s)[0] for r in results]
|
||||
return results
|
||||
|
||||
def _generate(self,
|
||||
generator,
|
||||
session_id,
|
||||
prompt: PromptType,
|
||||
gen_config=None) -> str:
|
||||
"""Generate results given a list of inputs.
|
||||
|
||||
Args:
|
||||
prompt (PromptType): A string or PromptDict.
|
||||
The PromptDict should be organized in OpenCompass'
|
||||
API format.
|
||||
gen_config (GenerationConfig, optional): Generation
|
||||
config to set arguments like top_k, top_p, temperature.
|
||||
Returns:
|
||||
str: The generated string.
|
||||
"""
|
||||
assert type(prompt) is str, 'We only support string for TurboMind Python API'
|
||||
|
||||
input_ids = self.tokenizer.encode(prompt, add_bos=False)
|
||||
for outputs in generator.stream_infer(session_id=session_id,
|
||||
input_ids=[input_ids],
|
||||
gen_config=gen_config,
|
||||
sequence_start=True,
|
||||
sequence_end=True,
|
||||
step=0,
|
||||
stream_output=False):
|
||||
if self.version_info >= (0, 4, 0):
|
||||
output_ids = outputs.token_ids
|
||||
else:
|
||||
_, output_ids, _ = outputs
|
||||
response = self.tokenizer.decode(output_ids)
|
||||
response = valid_str(response)
|
||||
return response
|
||||
|
||||
def get_token_len(self, prompt: str) -> int:
|
||||
"""Get lengths of the tokenized strings.
|
||||
|
||||
@ -201,5 +146,20 @@ class TurboMindModelwithChatTemplate(BaseModel):
|
||||
int: Length of the input tokens
|
||||
"""
|
||||
m = _convert_chat_messages([prompt])[0]
|
||||
t = self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True)
|
||||
t = self.tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True)
|
||||
return len(t['input_ids'])
|
||||
|
||||
def _build_pipe(self, model_path, backend, engine_config):
|
||||
from lmdeploy import (PytorchEngineConfig, TurbomindEngineConfig,
|
||||
pipeline)
|
||||
|
||||
assert backend in ['pytorch', 'turbomind'], \
|
||||
f'unsupported backend type: {backend}'
|
||||
|
||||
if backend == 'turbomind':
|
||||
filtered = {k: v for k, v in engine_config.items() if hasattr(TurbomindEngineConfig, k)}
|
||||
backend_config = TurbomindEngineConfig(**filtered)
|
||||
else:
|
||||
filtered = {k: v for k, v in engine_config.items() if hasattr(PytorchEngineConfig, k)}
|
||||
backend_config = PytorchEngineConfig(**filtered)
|
||||
return pipeline(model_path, backend_config=backend_config, log_level='INFO', max_log_len=10)
|
||||
|
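_build_pipe above keeps only the engine_config keys that the selected backend's config class actually defines before constructing it, so unknown keys are silently dropped. A generic sketch of that filtering pattern (the dataclass and keys below are invented for the example):

# Generic sketch of the "keep only known fields" filter; DummyEngineConfig is invented.
from dataclasses import dataclass

@dataclass
class DummyEngineConfig:
    session_len: int = 2048
    max_batch_size: int = 8

user_cfg = {'session_len': 32768, 'max_batch_size': 16, 'unknown_flag': True}
filtered = {k: v for k, v in user_cfg.items() if hasattr(DummyEngineConfig, k)}
print(DummyEngineConfig(**filtered))  # 'unknown_flag' is dropped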
@ -232,6 +232,8 @@ class DLCRunner(BaseRunner):
while True:
# 1. Avoid requesting DLC too frequently.
# 2. The DLC job may not be ready immediately after creation.
dlc_sleep_time = self.aliyun_cfg.get('dlc_sleep_time', 10)
time.sleep(dlc_sleep_time)
num_retry = 60
for retry_index in range(num_retry):
time.sleep(2)
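The polling loop now waits a configurable dlc_sleep_time (read from aliyun_cfg, defaulting to 10 seconds) before querying the job status. A hedged config sketch showing only the new key (other aliyun_cfg fields are omitted and the value is just an example):

# Hedged sketch: only the new key is shown; 30 is an example value.
aliyun_cfg = dict(
    dlc_sleep_time=30,  # seconds to wait before polling the DLC job status
)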
@ -4,6 +4,7 @@ from .all_obj import AllObjSummarizer
from .alpacaeval import AlpacaSummarizer
from .arenahard import ArenaHardSummarizer
from .charm import CharmMemSummarizer
from .common_summarizer import CommonSummarizer
from .compass_arena import CompassArenaSummarizer
from .compassbench import CompassBenchSummarizer
from .corev2 import Corev2Summarizer
||||
|
146
opencompass/summarizers/subjective/common_summarizer.py
Normal file
@ -0,0 +1,146 @@
|
||||
# flake8: noqa
|
||||
# yapf: disable
|
||||
import csv
|
||||
import os
|
||||
import os.path as osp
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
from mmengine import ConfigDict
|
||||
from tabulate import tabulate
|
||||
|
||||
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
|
||||
|
||||
from .compass_arena import CompassArenaSummarizer
|
||||
from .utils import get_judgeanswer_and_reference, get_outdir
|
||||
|
||||
|
||||
def model_abbr_from_cfg_used_in_summarizer(model):
|
||||
if model.get('summarizer_abbr', None):
|
||||
return model['summarizer_abbr']
|
||||
else:
|
||||
return model_abbr_from_cfg(model)
|
||||
|
||||
def post_process_single_rate(judgement: str):
|
||||
"""Input a string like below:
|
||||
|
||||
xxx[[5]]xxx, and extract the score
|
||||
"""
|
||||
pattern = r'Rating:\s*\[\[([\d.]+)\]\]'
|
||||
matched_result = re.findall(pattern, judgement)
|
||||
if matched_result:
|
||||
score = float(matched_result[0])
|
||||
else:
|
||||
return None
|
||||
return {'score': score}
|
||||
|
||||
|
||||
def get_capability_results(
|
||||
judged_answers,
|
||||
references,
|
||||
fout,
|
||||
fout_flag,
|
||||
model_abbr,
|
||||
judge_model_abbr,
|
||||
dataset_abbr,
|
||||
):
|
||||
capability_ratings = defaultdict(int)
|
||||
capability_counts = defaultdict(int)
|
||||
for ans, ref in zip(judged_answers, references):
|
||||
capability_ratings['total'] += ans['score']
|
||||
capability_counts['total'] += 1
|
||||
capability_ratings[ref['capability']] += ans['score']
|
||||
capability_counts[ref['capability']] += 1
|
||||
|
||||
capability_avg_ratings = defaultdict(float)
|
||||
|
||||
for capability, total_score in capability_ratings.items():
|
||||
s = total_score / capability_counts[capability]
|
||||
s = round(s, 2)
|
||||
capability_avg_ratings[capability] = s
|
||||
columns = list(capability_avg_ratings.keys())
|
||||
columns.insert(0, columns.pop(columns.index('total')))
|
||||
|
||||
if fout_flag == 0:
|
||||
with open(fout, 'w', newline='') as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
if fout_flag == 0:
|
||||
writer.writerow(['model', 'judge_model', 'dataset'] + columns)
|
||||
writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns])
|
||||
else:
|
||||
with open(fout, 'a+', newline='') as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns])
|
||||
|
||||
|
||||
class CommonSummarizer(CompassArenaSummarizer):
|
||||
"""Do the subjectivity analyze based on evaluation results.
|
||||
|
||||
Args:
|
||||
config (ConfigDict): The configuration object of the evaluation task.
|
||||
It's expected to be filled out at runtime.
|
||||
"""
|
||||
|
||||
def __init__(self, config: ConfigDict, judge_type='single_rate') -> None:
|
||||
self.judge_type = judge_type
|
||||
self.tasks = []
|
||||
self.cfg = config
|
||||
self.judge_type = 'single_rate'
|
||||
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
|
||||
self.judge_model_cfgs = self.cfg['judge_models']
|
||||
self.judge_map = {
|
||||
'single_rate': post_process_single_rate
|
||||
}
|
||||
self.judge_function = self.judge_map[self.judge_type]
|
||||
|
||||
def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
|
||||
"""Summarize the subjectivity analysis based on evaluation results.
|
||||
|
||||
Args:
|
||||
time_str (str): Timestamp for file naming.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: The summary results.
|
||||
"""
|
||||
if self.judge_type == 'pair':
|
||||
return super().summarize()
|
||||
|
||||
# self.judge_type == 'single'
|
||||
dataset_cfgs = self.cfg['datasets']
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
fout_flag = 0
|
||||
output_tmp_file = osp.join(output_dir, 'result.csv')
|
||||
output_file = osp.join(output_dir, 'total_result.csv')
|
||||
for eval_model_cfg in self.eval_model_cfgs:
|
||||
for judge_model_cfg in self.judge_model_cfgs:
|
||||
eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
|
||||
show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
|
||||
show_judge_model_abbr = model_abbr_from_cfg_used_in_summarizer(judge_model_cfg)
|
||||
judge_abbr = model_abbr_from_cfg(judge_model_cfg)
|
||||
subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + judge_abbr)
|
||||
if os.path.isdir(subdir_path):
|
||||
for dataset in dataset_cfgs:
|
||||
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
|
||||
show_dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
|
||||
get_capability_results(judged_answers, references, output_tmp_file, fout_flag, show_model_abbr, show_judge_model_abbr, show_dataset_abbr)
|
||||
fout_flag += 1
|
||||
else:
|
||||
print(subdir_path + ' does not exist! Please check!')
|
||||
with open(output_tmp_file, 'r') as f:
|
||||
csv_reader = csv.reader(f)
|
||||
header = next(csv_reader)
|
||||
table = [line for line in csv_reader]
|
||||
|
||||
new_header = [''] + [line[0] for line in table]
|
||||
new_table = [[h] + line[1:] for h, line in zip(header[1:], table)]
|
||||
new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
|
||||
t = tabulate(new_table, headers=new_header)
|
||||
with open(output_file, 'a') as f:
|
||||
f.write(','.join(new_header) + '\n')
|
||||
for line in new_table:
|
||||
f.write(','.join(map(str, line)) + '\n')
|
||||
print(t)
|
||||
print(output_file)
|
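post_process_single_rate above extracts the numeric score from a judge reply of the form 'Rating: [[x]]' and returns it as a dict, or None when no rating is found. A quick usage sketch (the judgement strings are invented):

# Usage sketch for post_process_single_rate; the judgement text is made up.
judgement = 'The answer is mostly correct but misses one edge case. Rating: [[7.5]]'
print(post_process_single_rate(judgement))         # {'score': 7.5}
print(post_process_single_rate('no rating here'))  # None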
@ -9,7 +9,7 @@ from mmengine.config import Config
from opencompass.datasets.custom import make_custom_dataset_config
from opencompass.models import (VLLM, HuggingFace, HuggingFaceBaseModel,
HuggingFaceCausalLM, HuggingFaceChatGLM3,
HuggingFacewithChatTemplate, TurboMindModel,
HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate,
VLLMwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
@ -233,7 +233,7 @@ def change_accelerator(models, accelerator):
model_accels = []
for model in models:
logger.info(f'Transforming {model["abbr"]} to {accelerator}')
# change HuggingFace model to VLLM or TurboMindModel
# change HuggingFace model to VLLM or LMDeploy
if model['type'] in [HuggingFace, HuggingFaceCausalLM, HuggingFaceChatGLM3, f'{HuggingFaceBaseModel.__module__}.{HuggingFaceBaseModel.__name__}']:
gen_args = dict()
if model.get('generation_kwargs') is not None:
@ -254,10 +254,10 @@ def change_accelerator(models, accelerator):

if accelerator == 'lmdeploy':
logger.info(f'Transforming {model["abbr"]} to {accelerator}')
mod = TurboMindModel
mod = TurboMindModelwithChatTemplate
acc_model = dict(
type=f'{mod.__module__}.{mod.__name__}',
abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
path=model['path'],
engine_config=dict(session_len=model['max_seq_len'],
max_batch_size=model['batch_size'],
@ -270,7 +270,6 @@ def change_accelerator(models, accelerator):
max_out_len=model['max_out_len'],
max_seq_len=model['max_seq_len'],
batch_size=model['batch_size'],
concurrency=model['batch_size'],
run_cfg=model['run_cfg'],
)
for item in ['meta_template']:
@ -312,7 +311,7 @@ def change_accelerator(models, accelerator):
mod = TurboMindModelwithChatTemplate
acc_model = dict(
type=f'{mod.__module__}.{mod.__name__}',
abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind',
abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
path=model['path'],
engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
||||
|
@ -1,6 +1,7 @@
# Alpaca-eval
alpaca-eval==0.6
cn2an
dingo-python
# Icl topk retriever
faiss_gpu==1.7.2
# Humaneval, Humaneval X

@ -23,6 +23,7 @@ python-Levenshtein
rank_bm25==0.2.2
rapidfuzz
requests>=2.31.0
retrying
rich
rouge
-e git+https://github.com/Isaac-JL-Chen/rouge_chinese.git@master#egg=rouge_chinese