From a32f21a3569e20cad1469cd2bca1104b61953917 Mon Sep 17 00:00:00 2001
From: Fengzhe Zhou
Date: Fri, 28 Jun 2024 14:16:34 +0800
Subject: [PATCH] [Sync] Sync with internal codes 2024.06.28 (#1279)

---
 .gitignore | 1 +
 .pre-commit-config-zh-cn.yaml | 1 +
 configs/dataset_collections/chat_OC15.py | 4 +-
 .../datasets/CHARM/charm_reason_ppl_3da4de.py | 57 +++
 ...b4254.py => deprecated_apps_gen_5b4254.py} | 4 +-
 ...fbb95.py => deprecated_apps_gen_7fbb95.py} | 4 +-
 ...4dee3.py => deprecated_apps_gen_b4dee3.py} | 4 +-
 configs/datasets/collections/base_core.py | 2 +-
 .../datasets/collections/base_medium_llama.py | 2 +-
 configs/datasets/collections/chat_core.py | 4 +-
 .../datasets/collections/leaderboard/qwen.py | 4 +-
 .../collections/leaderboard/qwen_chat.py | 4 +-
 .../agent/cibench_template_gen_e6b12a.py | 57 +++
 .../agent/mus_teval_gen_105c48.py | 56 +++
 .../code/compassbench_v1_1_code_gen_986f01.py | 291 +++++++++++++++
 .../compassbench_v1_knowledge_gen_bd74e0.py | 133 +++++++
 .../compassbench_v1_language_gen_7aa06d.py | 46 +++
 .../math/compassbench_v1_1_math_gen_1dc21d.py | 81 ++++
 .../math/mathbench_prompt.py | 103 +++++
 .../compassbench_v1_reason_gen_d26d08.py | 28 ++
 .../agent/cibench_template_gen_e6b12a.py | 57 +++
 .../agent/mus_teval_gen_105c48.py | 56 +++
 .../code/compassbench_v1_1_code_gen_986f01.py | 291 +++++++++++++++
 .../compassbench_v1_knowledge_gen_bd74e0.py | 133 +++++++
 .../compassbench_v1_language_gen_7aa06d.py | 46 +++
 .../math/compassbench_v1_1_math_gen_1dc21d.py | 81 ++++
 .../math/mathbench_prompt.py | 103 +++++
 .../compassbench_v1_reason_gen_d26d08.py | 28 ++
 ...a9ed9.py => deprecated_drop_gen_8a9ed9.py} | 0
 .../{drop_gen_e18aaf.py => drop_examples.py} | 43 +--
 configs/datasets/drop/drop_gen.py | 2 +-
 configs/datasets/drop/drop_gen_a2697c.py | 43 +++
 configs/datasets/drop/drop_gen_eb14af.py | 34 ++
 .../drop_openai_simple_evals_gen_3857b0.py | 34 ++
 .../gsm8k/gsm8k_0shot_v2_gen_a58960.py | 37 ++
 configs/datasets/humaneval/README.md | 2 +-
 ....py => deprecated_humaneval_gen_4a6eef.py} | 4 +-
 ....py => deprecated_humaneval_gen_6d1cc2.py} | 4 +-
 .../deprecated_humaneval_gen_8e312c.py | 36 --
 ....py => deprecated_humaneval_gen_a82cae.py} | 4 +-
 ....py => deprecated_humaneval_gen_d2537e.py} | 4 +-
 ....py => deprecated_humaneval_gen_fd5822.py} | 4 +-
 ....py => deprecated_humaneval_gen_ff7054.py} | 4 +-
 .../humaneval/humaneval_gen_66a7f4.py | 35 ++
 .../humaneval/humaneval_gen_6f294d.py | 41 --
 .../humaneval/humaneval_gen_8e312c.py | 5 +-
 ...umaneval_openai_sample_evals_gen_159614.py | 4 +-
 .../humaneval/humaneval_passk_gen_8e312c.py | 4 +-
 .../humaneval_repeat10_gen_8e312c.py | 4 +-
 .../humaneval_cn/humaneval_cn_gen_6313aa.py | 4 +-
 .../humaneval_cn_passk_gen_6313aa.py | 4 +-
 .../humaneval_cn_repeat10_gen_6313aa.py | 4 +-
 .../humaneval_plus_gen_66a7f4.py | 35 ++
 .../humaneval_plus_gen_8e312c.py | 5 +-
 .../humaneval_plus_passk_gen_8e312c.py | 4 +-
 .../humaneval_plus_repeat10_gen_8e312c.py | 4 +-
 .../datasets/mmlu_pro/mmlu_pro_categories.py | 16 +
 .../datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py | 58 +++
 .../datasets/nq/nq_open_1shot_gen_2e45e5.py | 61 +++
 .../alignbench_v1_1_judgeby_critiquellm.py | 64 ++++
 .../compassbench/compassbench_compare.py | 2 +-
 .../compassbench/compassbench_compare_v11.py | 58 +++
 .../compassbench_compare_v11_patch.py | 58 +++
 .../compassbench/compassbench_compare_v12.py | 58 +++
 ...en_411572.py => taco_levels_gen_411572.py} | 0
 .../triviaqa_wiki_1shot_gen_bc5f21.py | 62 +++
 configs/eval_hf_llama2.py | 2 +-
 configs/eval_internlm2_keyset.py | 2 +-
 configs/eval_lightllm.py | 2 +-
 configs/eval_mixtral_8x7b.py | 8 -
 configs/eval_mmlu_pro.py | 34 ++
 configs/models/chatglm/hf_glm4_9b_chat.py | 13 +
 configs/models/chatglm/vllm_chatglm3_6b.py | 13 +
 configs/models/chatglm/vllm_glm4_9b_chat.py | 14 +
 .../models/deepseek/hf_deepseek_v2_chat.py | 2 +-
 .../models/deepseek/hf_deepseek_v2_lite.py | 17 +
 .../deepseek/hf_deepseek_v2_lite_chat.py | 17 +
 .../deepseek/lmdeploy_deepseek_67b_base.py | 15 +
 .../deepseek/lmdeploy_deepseek_67b_chat.py | 15 +
 .../deepseek/lmdeploy_deepseek_7b_base.py | 15 +
 .../deepseek/lmdeploy_deepseek_7b_chat.py | 15 +
 .../models/deepseek/vllm_deepseek_67b_chat.py | 13 +
 .../models/deepseek/vllm_deepseek_7b_chat.py | 13 +
 .../deepseek/vllm_deepseek_moe_16b_base.py | 15 +
 .../deepseek/vllm_deepseek_moe_16b_chat.py | 13 +
 configs/models/gemma/vllm_gemma_2b.py | 15 +
 configs/models/gemma/vllm_gemma_2b_it.py | 14 +
 configs/models/gemma/vllm_gemma_7b.py | 15 +
 configs/models/gemma/vllm_gemma_7b_it.py | 14 +
 configs/models/hf_internlm/README.md | 2 +-
 .../hf_internlm/lmdeploy_internlm2_1_8b.py | 15 +
 .../hf_internlm/lmdeploy_internlm2_20b.py | 26 +-
 .../hf_internlm/lmdeploy_internlm2_7b.py | 15 +
 .../lmdeploy_internlm2_base_20b.py | 15 +
 .../hf_internlm/lmdeploy_internlm2_base_7b.py | 15 +
 .../lmdeploy_internlm2_chat_1_8b.py | 16 +-
 .../lmdeploy_internlm2_chat_1_8b_sft.py | 15 +
 .../lmdeploy_internlm2_chat_20b.py | 37 +-
 .../lmdeploy_internlm2_chat_20b_sft.py | 15 +
 .../hf_internlm/lmdeploy_internlm2_chat_7b.py | 16 +-
 .../lmdeploy_internlm2_chat_7b_sft.py | 15 +
 .../hf_internlm/lmdeploy_internlm_20b.py | 15 +
 .../hf_internlm/lmdeploy_internlm_7b.py | 15 +
 .../hf_internlm/lmdeploy_internlm_chat_20b.py | 15 +
 .../hf_internlm/lmdeploy_internlm_chat_7b.py | 15 +
 .../hf_internlm/vllm_internlm2_chat_1_8b.py | 13 +
 .../vllm_internlm2_chat_1_8b_sft.py | 13 +
 .../hf_internlm/vllm_internlm2_chat_20b.py | 13 +
 .../vllm_internlm2_chat_20b_sft.py | 13 +
 .../hf_internlm/vllm_internlm2_chat_7b_sft.py | 13 +
 .../hf_internlm/vllm_internlm2_series.py | 25 ++
 .../models/hf_llama/lmdeploy_llama2_13b.py | 15 +
 .../hf_llama/lmdeploy_llama2_13b_chat.py | 15 +
 .../models/hf_llama/lmdeploy_llama2_70b.py | 15 +
 .../hf_llama/lmdeploy_llama2_70b_chat.py | 15 +
 configs/models/hf_llama/lmdeploy_llama2_7b.py | 15 +
 .../hf_llama/lmdeploy_llama2_7b_chat.py | 15 +
 .../models/hf_llama/lmdeploy_llama3_70b.py | 15 +
 .../hf_llama/lmdeploy_llama3_70b_instruct.py | 22 +-
 configs/models/hf_llama/lmdeploy_llama3_8b.py | 15 +
 .../hf_llama/lmdeploy_llama3_8b_instruct.py | 22 +-
 configs/models/hf_llama/lmdeploy_llama_13b.py | 15 +
 configs/models/hf_llama/lmdeploy_llama_30b.py | 15 +
 configs/models/hf_llama/lmdeploy_llama_65b.py | 15 +
 configs/models/hf_llama/lmdeploy_llama_7b.py | 15 +
 .../models/hf_llama/lmdeploy_llama_series.py | 30 --
 configs/models/hf_llama/vllm_llama_series.py | 29 ++
 .../models/mistral/lmdeploy_mistral_series.py | 24 --
 .../mistral/vllm_mistral_7b_instruct_v0_1.py | 22 +-
 .../mistral/vllm_mistral_7b_instruct_v0_2.py | 22 +-
 .../models/mistral/vllm_mistral_7b_v0_1.py | 14 +-
 .../models/mistral/vllm_mistral_7b_v0_2.py | 14 +-
 .../vllm_mixtral_8x22b_instruct_v0_1.py | 15 +
 .../models/mistral/vllm_mixtral_8x22b_v0_1.py | 15 +
 .../vllm_mixtral_8x7b_instruct_v0_1.py | 21 +-
 .../models/mistral/vllm_mixtral_8x7b_v0_1.py | 15 +
 .../models/nanbeige/hf_nanbeige2_16b_chat.py | 12 +
 configs/models/others/hf_telechat_12b_v2.py | 26 ++
 configs/models/others/hf_telechat_52b.py | 26 ++
 ..._telechat_7b_chat.py => hf_telechat_7b.py} | 14 +-
configs/models/others/vllm_dbrx_instruct.py | 14 + .../models/phi/hf_phi_3_medium_4k_instruct.py | 12 + .../models/phi/hf_phi_3_mini_4k_instruct.py | 12 + .../models/phi/hf_phi_3_small_8k_instruct.py | 12 + configs/models/qwen/README.md | 2 +- configs/models/qwen/hf_qwen1_5_0_5b_chat.py | 1 + configs/models/qwen/hf_qwen1_5_110b_chat.py | 1 + configs/models/qwen/hf_qwen1_5_14b_chat.py | 1 + configs/models/qwen/hf_qwen1_5_1_8b_chat.py | 1 + configs/models/qwen/hf_qwen1_5_32b_chat.py | 1 + configs/models/qwen/hf_qwen1_5_4b_chat.py | 1 + configs/models/qwen/hf_qwen1_5_72b_chat.py | 1 + configs/models/qwen/hf_qwen1_5_7b_chat.py | 1 + .../models/qwen/hf_qwen1_5_moe_a2_7b_chat.py | 1 + configs/models/qwen/hf_qwen2_0_5b_instruct.py | 12 + configs/models/qwen/hf_qwen2_1_5b_instruct.py | 12 + ...2_moe_57b_a14b.py => hf_qwen2_57b_a14b.py} | 4 +- configs/models/qwen/hf_qwen2_7b_instruct.py | 12 + configs/models/qwen/hf_qwen_14b_chat.py | 24 +- configs/models/qwen/hf_qwen_1_8b_chat.py | 24 +- configs/models/qwen/hf_qwen_72b_chat.py | 24 +- configs/models/qwen/hf_qwen_7b_chat.py | 26 +- configs/models/qwen/lmdeploy_qwen1_5_110b.py | 15 + .../models/qwen/lmdeploy_qwen1_5_110b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen1_5_14b.py | 15 + .../models/qwen/lmdeploy_qwen1_5_14b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen1_5_1_8b.py | 15 + .../models/qwen/lmdeploy_qwen1_5_1_8b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen1_5_32b.py | 15 + .../models/qwen/lmdeploy_qwen1_5_32b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen1_5_4b.py | 15 + .../models/qwen/lmdeploy_qwen1_5_4b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen1_5_72b.py | 15 + .../models/qwen/lmdeploy_qwen1_5_72b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen1_5_7b.py | 15 + .../models/qwen/lmdeploy_qwen1_5_7b_chat.py | 16 + .../models/qwen/lmdeploy_qwen1_5_series.py | 34 +- configs/models/qwen/lmdeploy_qwen2_1_5b.py | 15 + .../qwen/lmdeploy_qwen2_1_5b_instruct.py | 15 + configs/models/qwen/lmdeploy_qwen2_72b.py | 15 + .../qwen/lmdeploy_qwen2_72b_instruct.py | 15 + configs/models/qwen/lmdeploy_qwen2_7b.py | 15 + .../models/qwen/lmdeploy_qwen2_7b_instruct.py | 15 + configs/models/qwen/lmdeploy_qwen2_series.py | 26 ++ configs/models/qwen/lmdeploy_qwen_14b.py | 15 + configs/models/qwen/lmdeploy_qwen_14b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen_1_8b.py | 15 + .../models/qwen/lmdeploy_qwen_1_8b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen_72b.py | 15 + configs/models/qwen/lmdeploy_qwen_72b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen_7b.py | 15 + configs/models/qwen/lmdeploy_qwen_7b_chat.py | 16 + configs/models/qwen/lmdeploy_qwen_series.py | 5 +- configs/models/qwen/vllm_qwen1_5_0_5b.py | 15 + configs/models/qwen/vllm_qwen1_5_0_5b_chat.py | 14 + configs/models/qwen/vllm_qwen1_5_110b.py | 15 + configs/models/qwen/vllm_qwen1_5_110b_chat.py | 14 + configs/models/qwen/vllm_qwen1_5_14b.py | 15 + configs/models/qwen/vllm_qwen1_5_14b_chat.py | 23 +- configs/models/qwen/vllm_qwen1_5_1_8b.py | 15 + configs/models/qwen/vllm_qwen1_5_1_8b_chat.py | 14 + configs/models/qwen/vllm_qwen1_5_32b.py | 15 + configs/models/qwen/vllm_qwen1_5_32b_chat.py | 14 + configs/models/qwen/vllm_qwen1_5_4b.py | 15 + configs/models/qwen/vllm_qwen1_5_4b_chat.py | 14 + configs/models/qwen/vllm_qwen1_5_72b.py | 9 +- configs/models/qwen/vllm_qwen1_5_72b_chat.py | 21 +- configs/models/qwen/vllm_qwen1_5_7b.py | 15 + configs/models/qwen/vllm_qwen1_5_7b_chat.py | 3 +- configs/models/qwen/vllm_qwen1_5_moe_a2_7b.py | 15 + .../qwen/vllm_qwen1_5_moe_a2_7b_chat.py | 14 
+ configs/models/qwen/vllm_qwen1_5_series.py | 29 ++ configs/models/qwen/vllm_qwen2_0_5b.py | 15 + .../models/qwen/vllm_qwen2_0_5b_instruct.py | 14 + configs/models/qwen/vllm_qwen2_1_5b.py | 15 + .../models/qwen/vllm_qwen2_1_5b_instruct.py | 14 + .../qwen/vllm_qwen2_57b_a14b_instruct.py | 14 + configs/models/qwen/vllm_qwen2_72b.py | 15 + .../models/qwen/vllm_qwen2_72b_instruct.py | 14 + configs/models/qwen/vllm_qwen2_7b.py | 15 + configs/models/qwen/vllm_qwen2_7b_instruct.py | 14 + configs/models/qwen/vllm_qwen2_series.py | 25 ++ configs/models/qwen/vllm_qwen_14b.py | 15 + configs/models/qwen/vllm_qwen_14b_chat.py | 23 +- configs/models/qwen/vllm_qwen_1_8b.py | 15 + configs/models/qwen/vllm_qwen_1_8b_chat.py | 14 + configs/models/qwen/vllm_qwen_72b.py | 15 + configs/models/qwen/vllm_qwen_72b_chat.py | 21 +- configs/models/qwen/vllm_qwen_7b.py | 15 + configs/models/qwen/vllm_qwen_7b_chat.py | 14 + configs/models/qwen/vllm_qwen_series.py | 24 ++ configs/summarizers/charm_reason.py | 4 +- configs/summarizers/compass_knowledge.py | 26 -- configs/summarizers/compass_math.py | 24 -- .../compassbench_v1_1_objective.py | 244 ++++++++++++ .../compassbench_v1_1_objective_public.py | 22 ++ .../summarizers/compassbench_v1_language.py | 53 --- configs/summarizers/compassbench_v1_reason.py | 28 -- configs/summarizers/groups/mmlu_pro.py | 5 + configs/summarizers/mmlu_pro.py | 25 ++ docs/en/user_guides/metrics.md | 4 +- docs/zh_cn/user_guides/metrics.md | 4 +- opencompass/cli/main.py | 5 +- opencompass/datasets/__init__.py | 1 + opencompass/datasets/benbench.py | 88 +++++ opencompass/datasets/charm.py | 2 +- opencompass/datasets/humaneval.py | 298 ++++++--------- opencompass/datasets/mathbench.py | 283 +++++++++++++- opencompass/datasets/mmlu_pro.py | 31 ++ opencompass/datasets/subjective/__init__.py | 2 + .../compassbench_control_length_bias.py | 130 +++++++ opencompass/datasets/subjective/mtbench101.py | 1 + opencompass/datasets/taco.py | 3 +- opencompass/models/__init__.py | 3 +- opencompass/models/ai360_api.py | 14 +- opencompass/models/baichuan_api.py | 214 +++-------- opencompass/models/baidu_api.py | 2 + opencompass/models/doubao.py | 110 ++++++ opencompass/models/huggingface_above_v4_33.py | 45 ++- opencompass/models/hunyuan_api.py | 78 ++-- opencompass/models/minimax_api.py | 5 +- opencompass/models/openai_api.py | 10 +- opencompass/models/sensetime_api.py | 31 +- .../models/turbomind_with_tf_above_v4_33.py | 24 +- opencompass/models/vllm.py | 18 +- .../models/vllm_with_tf_above_v4_33.py | 15 +- opencompass/models/xunfei_api.py | 49 +-- opencompass/models/yi_api.py | 6 +- .../openicl/icl_evaluator/lm_evaluator.py | 103 +++-- .../icl_inferencer/icl_gen_inferencer.py | 21 ++ opencompass/partitioners/num_worker.py | 9 +- opencompass/runners/dlc.py | 8 +- opencompass/runners/local.py | 12 +- opencompass/summarizers/default.py | 11 +- .../summarizers/subjective/__init__.py | 1 + .../summarizers/subjective/compassbench.py | 70 ++-- .../summarizers/subjective/compassbench_th.py | 353 ++++++++++++++++++ opencompass/summarizers/subjective/utils.py | 3 + opencompass/tasks/openicl_infer.py | 7 +- opencompass/tasks/subjective_eval.py | 9 + opencompass/utils/text_postprocessors.py | 3 +- requirements/runtime.txt | 2 +- tools/convert_alignmentbench.py | 4 +- tools/prediction_merger.py | 8 +- 284 files changed, 6233 insertions(+), 1290 deletions(-) create mode 100644 configs/datasets/CHARM/charm_reason_ppl_3da4de.py rename configs/datasets/apps/{apps_gen_5b4254.py => deprecated_apps_gen_5b4254.py} (87%) rename 
configs/datasets/apps/{apps_gen_7fbb95.py => deprecated_apps_gen_7fbb95.py} (89%) rename configs/datasets/apps/{apps_gen_b4dee3.py => deprecated_apps_gen_b4dee3.py} (86%) create mode 100644 configs/datasets/compassbench_20_v1_1/agent/cibench_template_gen_e6b12a.py create mode 100644 configs/datasets/compassbench_20_v1_1/agent/mus_teval_gen_105c48.py create mode 100644 configs/datasets/compassbench_20_v1_1/code/compassbench_v1_1_code_gen_986f01.py create mode 100644 configs/datasets/compassbench_20_v1_1/knowledge/compassbench_v1_knowledge_gen_bd74e0.py create mode 100644 configs/datasets/compassbench_20_v1_1/language/compassbench_v1_language_gen_7aa06d.py create mode 100644 configs/datasets/compassbench_20_v1_1/math/compassbench_v1_1_math_gen_1dc21d.py create mode 100644 configs/datasets/compassbench_20_v1_1/math/mathbench_prompt.py create mode 100644 configs/datasets/compassbench_20_v1_1/reason/compassbench_v1_reason_gen_d26d08.py create mode 100644 configs/datasets/compassbench_20_v1_1_public/agent/cibench_template_gen_e6b12a.py create mode 100644 configs/datasets/compassbench_20_v1_1_public/agent/mus_teval_gen_105c48.py create mode 100644 configs/datasets/compassbench_20_v1_1_public/code/compassbench_v1_1_code_gen_986f01.py create mode 100644 configs/datasets/compassbench_20_v1_1_public/knowledge/compassbench_v1_knowledge_gen_bd74e0.py create mode 100644 configs/datasets/compassbench_20_v1_1_public/language/compassbench_v1_language_gen_7aa06d.py create mode 100644 configs/datasets/compassbench_20_v1_1_public/math/compassbench_v1_1_math_gen_1dc21d.py create mode 100644 configs/datasets/compassbench_20_v1_1_public/math/mathbench_prompt.py create mode 100644 configs/datasets/compassbench_20_v1_1_public/reason/compassbench_v1_reason_gen_d26d08.py rename configs/datasets/drop/{drop_gen_8a9ed9.py => deprecated_drop_gen_8a9ed9.py} (100%) rename configs/datasets/drop/{drop_gen_e18aaf.py => drop_examples.py} (78%) create mode 100644 configs/datasets/drop/drop_gen_a2697c.py create mode 100644 configs/datasets/drop/drop_gen_eb14af.py create mode 100644 configs/datasets/drop/drop_openai_simple_evals_gen_3857b0.py create mode 100644 configs/datasets/gsm8k/gsm8k_0shot_v2_gen_a58960.py rename configs/datasets/humaneval/{humaneval_gen_4a6eef.py => deprecated_humaneval_gen_4a6eef.py} (90%) rename configs/datasets/humaneval/{humaneval_gen_6d1cc2.py => deprecated_humaneval_gen_6d1cc2.py} (90%) delete mode 100644 configs/datasets/humaneval/deprecated_humaneval_gen_8e312c.py rename configs/datasets/humaneval/{humaneval_gen_a82cae.py => deprecated_humaneval_gen_a82cae.py} (88%) rename configs/datasets/humaneval/{humaneval_gen_d2537e.py => deprecated_humaneval_gen_d2537e.py} (87%) rename configs/datasets/humaneval/{humaneval_gen_fd5822.py => deprecated_humaneval_gen_fd5822.py} (87%) rename configs/datasets/humaneval/{humaneval_gen_ff7054.py => deprecated_humaneval_gen_ff7054.py} (90%) create mode 100644 configs/datasets/humaneval/humaneval_gen_66a7f4.py delete mode 100644 configs/datasets/humaneval/humaneval_gen_6f294d.py create mode 100644 configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py create mode 100644 configs/datasets/mmlu_pro/mmlu_pro_categories.py create mode 100644 configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py create mode 100644 configs/datasets/nq/nq_open_1shot_gen_2e45e5.py create mode 100644 configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py create mode 100644 configs/datasets/subjective/compassbench/compassbench_compare_v11.py create mode 100644 
configs/datasets/subjective/compassbench/compassbench_compare_v11_patch.py create mode 100644 configs/datasets/subjective/compassbench/compassbench_compare_v12.py rename configs/datasets/taco/{taco_staged_gen_411572.py => taco_levels_gen_411572.py} (100%) create mode 100644 configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_bc5f21.py delete mode 100644 configs/eval_mixtral_8x7b.py create mode 100644 configs/eval_mmlu_pro.py create mode 100644 configs/models/chatglm/hf_glm4_9b_chat.py create mode 100644 configs/models/chatglm/vllm_chatglm3_6b.py create mode 100644 configs/models/chatglm/vllm_glm4_9b_chat.py create mode 100644 configs/models/deepseek/hf_deepseek_v2_lite.py create mode 100644 configs/models/deepseek/hf_deepseek_v2_lite_chat.py create mode 100644 configs/models/deepseek/lmdeploy_deepseek_67b_base.py create mode 100644 configs/models/deepseek/lmdeploy_deepseek_67b_chat.py create mode 100644 configs/models/deepseek/lmdeploy_deepseek_7b_base.py create mode 100644 configs/models/deepseek/lmdeploy_deepseek_7b_chat.py create mode 100644 configs/models/deepseek/vllm_deepseek_67b_chat.py create mode 100644 configs/models/deepseek/vllm_deepseek_7b_chat.py create mode 100644 configs/models/deepseek/vllm_deepseek_moe_16b_base.py create mode 100644 configs/models/deepseek/vllm_deepseek_moe_16b_chat.py create mode 100644 configs/models/gemma/vllm_gemma_2b.py create mode 100644 configs/models/gemma/vllm_gemma_2b_it.py create mode 100644 configs/models/gemma/vllm_gemma_7b.py create mode 100644 configs/models/gemma/vllm_gemma_7b_it.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_1_8b.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_7b.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_base_20b.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_base_7b.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b_sft.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_chat_20b_sft.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_chat_7b_sft.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm_20b.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm_7b.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py create mode 100644 configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py create mode 100644 configs/models/hf_internlm/vllm_internlm2_chat_1_8b.py create mode 100644 configs/models/hf_internlm/vllm_internlm2_chat_1_8b_sft.py create mode 100644 configs/models/hf_internlm/vllm_internlm2_chat_20b.py create mode 100644 configs/models/hf_internlm/vllm_internlm2_chat_20b_sft.py create mode 100644 configs/models/hf_internlm/vllm_internlm2_chat_7b_sft.py create mode 100644 configs/models/hf_internlm/vllm_internlm2_series.py create mode 100644 configs/models/hf_llama/lmdeploy_llama2_13b.py create mode 100644 configs/models/hf_llama/lmdeploy_llama2_13b_chat.py create mode 100644 configs/models/hf_llama/lmdeploy_llama2_70b.py create mode 100644 configs/models/hf_llama/lmdeploy_llama2_70b_chat.py create mode 100644 configs/models/hf_llama/lmdeploy_llama2_7b.py create mode 100644 configs/models/hf_llama/lmdeploy_llama2_7b_chat.py create mode 100644 configs/models/hf_llama/lmdeploy_llama3_70b.py create mode 100644 configs/models/hf_llama/lmdeploy_llama3_8b.py create mode 100644 configs/models/hf_llama/lmdeploy_llama_13b.py create mode 100644 configs/models/hf_llama/lmdeploy_llama_30b.py create mode 100644 
configs/models/hf_llama/lmdeploy_llama_65b.py create mode 100644 configs/models/hf_llama/lmdeploy_llama_7b.py delete mode 100644 configs/models/hf_llama/lmdeploy_llama_series.py create mode 100644 configs/models/hf_llama/vllm_llama_series.py delete mode 100644 configs/models/mistral/lmdeploy_mistral_series.py create mode 100644 configs/models/mistral/vllm_mixtral_8x22b_instruct_v0_1.py create mode 100644 configs/models/mistral/vllm_mixtral_8x22b_v0_1.py create mode 100644 configs/models/mistral/vllm_mixtral_8x7b_v0_1.py create mode 100644 configs/models/nanbeige/hf_nanbeige2_16b_chat.py create mode 100644 configs/models/others/hf_telechat_12b_v2.py create mode 100644 configs/models/others/hf_telechat_52b.py rename configs/models/others/{hf_telechat_7b_chat.py => hf_telechat_7b.py} (59%) create mode 100644 configs/models/others/vllm_dbrx_instruct.py create mode 100644 configs/models/phi/hf_phi_3_medium_4k_instruct.py create mode 100644 configs/models/phi/hf_phi_3_mini_4k_instruct.py create mode 100644 configs/models/phi/hf_phi_3_small_8k_instruct.py create mode 100644 configs/models/qwen/hf_qwen2_0_5b_instruct.py create mode 100644 configs/models/qwen/hf_qwen2_1_5b_instruct.py rename configs/models/qwen/{hf_qwen2_moe_57b_a14b.py => hf_qwen2_57b_a14b.py} (71%) create mode 100644 configs/models/qwen/hf_qwen2_7b_instruct.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_110b.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_14b.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_1_8b.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_32b.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_4b.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_72b.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_7b.py create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen2_1_5b.py create mode 100644 configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py create mode 100644 configs/models/qwen/lmdeploy_qwen2_72b.py create mode 100644 configs/models/qwen/lmdeploy_qwen2_72b_instruct.py create mode 100644 configs/models/qwen/lmdeploy_qwen2_7b.py create mode 100644 configs/models/qwen/lmdeploy_qwen2_7b_instruct.py create mode 100644 configs/models/qwen/lmdeploy_qwen2_series.py create mode 100644 configs/models/qwen/lmdeploy_qwen_14b.py create mode 100644 configs/models/qwen/lmdeploy_qwen_14b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen_1_8b.py create mode 100644 configs/models/qwen/lmdeploy_qwen_1_8b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen_72b.py create mode 100644 configs/models/qwen/lmdeploy_qwen_72b_chat.py create mode 100644 configs/models/qwen/lmdeploy_qwen_7b.py create mode 100644 configs/models/qwen/lmdeploy_qwen_7b_chat.py create mode 100644 configs/models/qwen/vllm_qwen1_5_0_5b.py create mode 100644 configs/models/qwen/vllm_qwen1_5_0_5b_chat.py create mode 100644 configs/models/qwen/vllm_qwen1_5_110b.py create mode 100644 configs/models/qwen/vllm_qwen1_5_110b_chat.py create mode 100644 configs/models/qwen/vllm_qwen1_5_14b.py create mode 100644 
configs/models/qwen/vllm_qwen1_5_1_8b.py create mode 100644 configs/models/qwen/vllm_qwen1_5_1_8b_chat.py create mode 100644 configs/models/qwen/vllm_qwen1_5_32b.py create mode 100644 configs/models/qwen/vllm_qwen1_5_32b_chat.py create mode 100644 configs/models/qwen/vllm_qwen1_5_4b.py create mode 100644 configs/models/qwen/vllm_qwen1_5_4b_chat.py create mode 100644 configs/models/qwen/vllm_qwen1_5_7b.py create mode 100644 configs/models/qwen/vllm_qwen1_5_moe_a2_7b.py create mode 100644 configs/models/qwen/vllm_qwen1_5_moe_a2_7b_chat.py create mode 100644 configs/models/qwen/vllm_qwen1_5_series.py create mode 100644 configs/models/qwen/vllm_qwen2_0_5b.py create mode 100644 configs/models/qwen/vllm_qwen2_0_5b_instruct.py create mode 100644 configs/models/qwen/vllm_qwen2_1_5b.py create mode 100644 configs/models/qwen/vllm_qwen2_1_5b_instruct.py create mode 100644 configs/models/qwen/vllm_qwen2_57b_a14b_instruct.py create mode 100644 configs/models/qwen/vllm_qwen2_72b.py create mode 100644 configs/models/qwen/vllm_qwen2_72b_instruct.py create mode 100644 configs/models/qwen/vllm_qwen2_7b.py create mode 100644 configs/models/qwen/vllm_qwen2_7b_instruct.py create mode 100644 configs/models/qwen/vllm_qwen2_series.py create mode 100644 configs/models/qwen/vllm_qwen_14b.py create mode 100644 configs/models/qwen/vllm_qwen_1_8b.py create mode 100644 configs/models/qwen/vllm_qwen_1_8b_chat.py create mode 100644 configs/models/qwen/vllm_qwen_72b.py create mode 100644 configs/models/qwen/vllm_qwen_7b.py create mode 100644 configs/models/qwen/vllm_qwen_7b_chat.py create mode 100644 configs/models/qwen/vllm_qwen_series.py delete mode 100644 configs/summarizers/compass_knowledge.py delete mode 100644 configs/summarizers/compass_math.py create mode 100644 configs/summarizers/compassbench_v1_1_objective.py create mode 100644 configs/summarizers/compassbench_v1_1_objective_public.py delete mode 100644 configs/summarizers/compassbench_v1_language.py delete mode 100644 configs/summarizers/compassbench_v1_reason.py create mode 100644 configs/summarizers/groups/mmlu_pro.py create mode 100644 configs/summarizers/mmlu_pro.py create mode 100644 opencompass/datasets/benbench.py create mode 100644 opencompass/datasets/mmlu_pro.py create mode 100644 opencompass/datasets/subjective/compassbench_control_length_bias.py create mode 100644 opencompass/models/doubao.py create mode 100644 opencompass/summarizers/subjective/compassbench_th.py diff --git a/.gitignore b/.gitignore index d4c9e2a6..8271d6c7 100644 --- a/.gitignore +++ b/.gitignore @@ -102,6 +102,7 @@ configs/sft_cfg/60B/* configs/sft_cfg/100B/* configs/cky/ +configs/_internal_legacy* # in case llama clone in the opencompass llama/ diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml index e5225885..343d9111 100644 --- a/.pre-commit-config-zh-cn.yaml +++ b/.pre-commit-config-zh-cn.yaml @@ -35,6 +35,7 @@ repos: exclude: | (?x)^( .*\.jsonl| + opencompass/datasets/subjective/mtbench101.py| configs/ ) - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks diff --git a/configs/dataset_collections/chat_OC15.py b/configs/dataset_collections/chat_OC15.py index 258a654f..ec879336 100644 --- a/configs/dataset_collections/chat_OC15.py +++ b/configs/dataset_collections/chat_OC15.py @@ -5,8 +5,8 @@ with read_base(): from ..datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets from ..datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets - from 
..datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import triviaqa_datasets - from ..datasets.nq.nq_open_1shot_gen_01cf41 import nq_datasets + from ..datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets + from ..datasets.nq.nq_open_1shot_gen_2e45e5 import nq_datasets from ..datasets.race.race_gen_69ee4f import race_datasets from ..datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets from ..datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets diff --git a/configs/datasets/CHARM/charm_reason_ppl_3da4de.py b/configs/datasets/CHARM/charm_reason_ppl_3da4de.py new file mode 100644 index 00000000..06bc2b10 --- /dev/null +++ b/configs/datasets/CHARM/charm_reason_ppl_3da4de.py @@ -0,0 +1,57 @@ +import os + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.datasets import CharmDataset +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator + +charm_tasks = [ + ['Chinese_Anachronisms_Judgment', 'AB'], + ['Chinese_Movie_and_Music_Recommendation', 'ABCD'], + ['Chinese_Natural_Language_Inference', 'ABC'], + ['Chinese_Reading_Comprehension', 'ABCD'], + ['Chinese_Sequence_Understanding', 'ABCD'], + ['Chinese_Sport_Understanding', 'AB'], + ['Chinese_Time_Understanding', 'ABCD'], + ['Global_Anachronisms_Judgment', 'AB'], + ['Global_Movie_and_Music_Recommendation', 'ABCD'], + ['Global_Natural_Language_Inference', 'ABC'], + ['Global_Reading_Comprehension', 'ABCD'], + ['Global_Sequence_Understanding', 'ABCD'], + ['Global_Sport_Understanding', 'AB'], + ['Global_Time_Understanding', 'ABCDEF'], +] + +charm_reason_datasets = [] +for task_name, options in charm_tasks: + + with open(os.path.join(os.path.dirname(__file__), 'few-shot-examples', f'{task_name}_Direct.txt'), 'r') as f: + few_shot_example = f.read() + + charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target') + + charm_reason_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + f'({opt})': f'{few_shot_example}\n{{input}}\nA: {opt}' for opt in options + }, + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + + charm_reason_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator)) + + charm_reason_datasets.append( + dict( + type=CharmDataset, + abbr=f'charm-reason-{task_name}_Direct', + path=f'data/CHARM/reasoning', + name=task_name, + reader_cfg=charm_reason_reader_cfg, + infer_cfg=charm_reason_infer_cfg, + eval_cfg=charm_reason_eval_cfg, + ) + ) diff --git a/configs/datasets/apps/apps_gen_5b4254.py b/configs/datasets/apps/deprecated_apps_gen_5b4254.py similarity index 87% rename from configs/datasets/apps/apps_gen_5b4254.py rename to configs/datasets/apps/deprecated_apps_gen_5b4254.py index 636d2a4b..7e6efd04 100644 --- a/configs/datasets/apps/apps_gen_5b4254.py +++ b/configs/datasets/apps/deprecated_apps_gen_5b4254.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess apps_reader_cfg = dict( input_columns=['question'], output_column='problem_id', train_split='test') @@ -17,7 +17,7 @@ apps_infer_cfg = dict( 
inferencer=dict(type=GenInferencer)) apps_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess), diff --git a/configs/datasets/apps/apps_gen_7fbb95.py b/configs/datasets/apps/deprecated_apps_gen_7fbb95.py similarity index 89% rename from configs/datasets/apps/apps_gen_7fbb95.py rename to configs/datasets/apps/deprecated_apps_gen_7fbb95.py index 33244e0d..87043e86 100644 --- a/configs/datasets/apps/apps_gen_7fbb95.py +++ b/configs/datasets/apps/deprecated_apps_gen_7fbb95.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess apps_reader_cfg = dict( input_columns=['question'], output_column='problem_id', train_split='test') @@ -24,7 +24,7 @@ apps_infer_cfg = dict( inferencer=dict(type=GenInferencer)) apps_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess), diff --git a/configs/datasets/apps/apps_gen_b4dee3.py b/configs/datasets/apps/deprecated_apps_gen_b4dee3.py similarity index 86% rename from configs/datasets/apps/apps_gen_b4dee3.py rename to configs/datasets/apps/deprecated_apps_gen_b4dee3.py index 0d018504..1f8b6773 100644 --- a/configs/datasets/apps/apps_gen_b4dee3.py +++ b/configs/datasets/apps/deprecated_apps_gen_b4dee3.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess apps_reader_cfg = dict( input_columns=['question'], output_column='problem_id', train_split='test') @@ -15,7 +15,7 @@ apps_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) apps_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), k=[1, 10, 100], pred_postprocessor=dict(type=humaneval_postprocess), ) diff --git a/configs/datasets/collections/base_core.py b/configs/datasets/collections/base_core.py index 9df244f2..2e38abb0 100644 --- a/configs/datasets/collections/base_core.py +++ b/configs/datasets/collections/base_core.py @@ -14,7 +14,7 @@ with read_base(): from ..gsm8k.gsm8k_gen_ee684f import gsm8k_datasets from ..math.math_evaluatorv2_gen_2f4a71 import math_datasets from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets - from ..humaneval.humaneval_gen_d2537e import humaneval_datasets + from ..humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/collections/base_medium_llama.py b/configs/datasets/collections/base_medium_llama.py index e36a51f5..b8d4b8ce 100644 --- a/configs/datasets/collections/base_medium_llama.py +++ b/configs/datasets/collections/base_medium_llama.py 
@@ -6,7 +6,7 @@ with read_base(): from ..agieval.agieval_mixed_713d14 import agieval_datasets from ..GaokaoBench.GaokaoBench_mixed_9af5ee import GaokaoBench_datasets from ..bbh.bbh_gen_5b92b0 import bbh_datasets - from ..humaneval.humaneval_gen_a82cae import humaneval_datasets + from ..humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets diff --git a/configs/datasets/collections/chat_core.py b/configs/datasets/collections/chat_core.py index a48ba9d9..b00b19da 100644 --- a/configs/datasets/collections/chat_core.py +++ b/configs/datasets/collections/chat_core.py @@ -5,8 +5,8 @@ with read_base(): from ..cmmlu.cmmlu_gen_c13365 import cmmlu_datasets from ..ceval.ceval_internal_gen_2daf24 import ceval_datasets from ..GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets - from ..triviaqa.triviaqa_wiki_1shot_gen_eaf81e import triviaqa_datasets - from ..nq.nq_open_1shot_gen_01cf41 import nq_datasets + from ..triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets + from ..nq.nq_open_1shot_gen_2e45e5 import nq_datasets from ..race.race_gen_69ee4f import race_datasets from ..winogrande.winogrande_5shot_gen_6447e6 import winogrande_datasets from ..hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets diff --git a/configs/datasets/collections/leaderboard/qwen.py b/configs/datasets/collections/leaderboard/qwen.py index fbaf660d..ac2679aa 100644 --- a/configs/datasets/collections/leaderboard/qwen.py +++ b/configs/datasets/collections/leaderboard/qwen.py @@ -42,8 +42,8 @@ with read_base(): from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets from ...math.math_gen_265cce import math_datasets from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from ...drop.drop_gen_8a9ed9 import drop_datasets - from ...humaneval.humaneval_gen_a82cae import humaneval_datasets + from ...drop.deprecated_drop_gen_8a9ed9 import drop_datasets + from ...humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets from ...bbh.bbh_gen_5bf00b import bbh_datasets diff --git a/configs/datasets/collections/leaderboard/qwen_chat.py b/configs/datasets/collections/leaderboard/qwen_chat.py index 918a8a05..b6d7bc7c 100644 --- a/configs/datasets/collections/leaderboard/qwen_chat.py +++ b/configs/datasets/collections/leaderboard/qwen_chat.py @@ -42,8 +42,8 @@ with read_base(): from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets from ...math.math_gen_265cce import math_datasets from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from ...drop.drop_gen_8a9ed9 import drop_datasets - from ...humaneval.humaneval_gen_a82cae import humaneval_datasets + from ...drop.deprecated_drop_gen_8a9ed9 import drop_datasets + from ...humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets from ...bbh.bbh_gen_5b92b0 import bbh_datasets diff --git a/configs/datasets/compassbench_20_v1_1/agent/cibench_template_gen_e6b12a.py b/configs/datasets/compassbench_20_v1_1/agent/cibench_template_gen_e6b12a.py new file mode 100644 index 00000000..bee65612 --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1/agent/cibench_template_gen_e6b12a.py @@ -0,0 +1,57 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from 
opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import CIBenchDataset, CIBenchEvaluator + +libs = [ + '/lightgbm', + '/matplotlib', + '/nltk', + '/opencv', + '/pandas', + '/pytorch', + '/scipy', + '/seaborn', + '/sklearn', + '/tensorflow', + '_chinese/lightgbm', + '_chinese/matplotlib', + '_chinese/nltk', + '_chinese/opencv', + '_chinese/pandas', + '_chinese/pytorch', + '_chinese/scipy', + '_chinese/seaborn', + '_chinese/sklearn', + '_chinese/tensorflow', +] + + +cibench_datasets = [] +for lib in libs: + cibench_reader_cfg = dict( + input_columns=['questions'], output_column='references', train_split='test', test_split='test' + ) + + cibench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{questions}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer, infer_mode='every'), + ) + + cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role='BOT') + + cibench_datasets.append( + dict( + abbr=f'cibench_template{lib}', + type=CIBenchDataset, + path=f'data/compassbench_v1.1/agent-cibench/cibench_template{lib}', + internet_check=False, + reader_cfg=cibench_reader_cfg, + infer_cfg=cibench_infer_cfg, + eval_cfg=cibench_eval_cfg, + ) + ) diff --git a/configs/datasets/compassbench_20_v1_1/agent/mus_teval_gen_105c48.py b/configs/datasets/compassbench_20_v1_1/agent/mus_teval_gen_105c48.py new file mode 100644 index 00000000..65b4595b --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1/agent/mus_teval_gen_105c48.py @@ -0,0 +1,56 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import ChatInferencer +from opencompass.openicl.icl_evaluator import TEvalEvaluator +from opencompass.datasets import teval_postprocess, TEvalDataset + +plugin_eval_subject_mapping = { + 'instruct': ['instruct_v1'], + 'instruct_zh': ['instruct_v1_zh'], + 'plan': ['plan_json_v1', 'plan_str_v1'], + 'plan_zh': ['plan_json_v1_zh', 'plan_str_v1_zh'], + 'review': ['review_str_v1'], + 'review_zh': ['review_str_v1_zh'], + 'reason_retrieve_understand': ['reason_retrieve_understand_json_v1'], + 'reason_retrieve_understand_zh': ['reason_retrieve_understand_json_v1_zh'], + 'reason': ['reason_str_v1'], + 'reason_zh': ['reason_str_v1_zh'], + 'retrieve': ['retrieve_str_v1'], + 'retrieve_zh': ['retrieve_str_v1_zh'], + 'understand': ['understand_str_v1'], + 'understand_zh': ['understand_str_v1_zh'], +} + +plugin_eval_datasets = [] +for _name in plugin_eval_subject_mapping: + plugin_eval_reader_cfg = dict(input_columns=['prompt'], output_column='ground_truth') + plugin_eval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=ChatInferencer), + ) + plugin_eval_eval_cfg = dict( + evaluator=dict(type=TEvalEvaluator, subset=_name), + pred_postprocessor=dict(type=teval_postprocess), + num_gpus=1, + ) + + for subset in plugin_eval_subject_mapping[_name]: + plugin_eval_datasets.append( + dict( + abbr='plugin_eval-mus-p10-' + subset, + type=TEvalDataset, + path='data/compassbench_v1.1/agent-teval-p10', + name=subset, + reader_cfg=plugin_eval_reader_cfg, + infer_cfg=plugin_eval_infer_cfg, + eval_cfg=plugin_eval_eval_cfg, + ) + ) diff --git a/configs/datasets/compassbench_20_v1_1/code/compassbench_v1_1_code_gen_986f01.py 
b/configs/datasets/compassbench_20_v1_1/code/compassbench_v1_1_code_gen_986f01.py new file mode 100644 index 00000000..07908fa2 --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1/code/compassbench_v1_1_code_gen_986f01.py @@ -0,0 +1,291 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, HumanEvalPlusEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import MBPPDataset, SanitizedMBPPDataset, MBPPEvaluator +from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator +from opencompass.datasets import LCDataset, LCPassKEvaluator +from opencompass.datasets import TACODataset, TACOEvaluator + +compassbench_v1_1_code_datasets = [] + +# --------------------------------------------------------------- HumanEval CN --------------------------------------------------------------- +humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test') + +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='完成以下Python代码任务:\n{prompt}'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +compassbench_v1_1_code_datasets.append( + dict( + abbr='openai_humaneval_cn', + type=HumanevalDataset, + path='data/compassbench_v1.1/code/humaneval_cn/human-eval-cn-v2-20210705.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg, + ) +) + +# --------------------------------------------------------------- HumanEval Plus --------------------------------------------------------------- +humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Complete the following python code:\n{prompt}'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalPlusEvaluator), + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +compassbench_v1_1_code_datasets.append( + dict( + abbr='humaneval_plus', + type=HumanevalDataset, + path='data/compassbench_v1.1/code/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg, + ) +) + +# --------------------------------------------------------------- MBPP CN --------------------------------------------------------------- +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def 
similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +compassbench_v1_1_code_datasets.append( + dict( + type=MBPPDataset, + abbr='mbpp_cn', + path='data/compassbench_v1.1/code/mbpp_cn/mbpp_cn.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +) + +# --------------------------------------------------------------- Sanitized MBPP --------------------------------------------------------------- +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +compassbench_v1_1_code_datasets.append( + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='data/compassbench_v1.1/code/mbpp/sanitized-mbpp.jsonl', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +) + +# --------------------------------------------------------------- HumanevalX --------------------------------------------------------------- +humanevalx_reader_cfg = dict(input_columns=['prompt'], output_column='declaration', train_split='test') + +humanevalx_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +humanevalx_eval_cfg_dict = { + lang: dict( + evaluator=dict( + type=HumanevalXEvaluator, + language=lang, + ip_address= + 'localhost', # replace to your code_eval_server ip_address, port + port=5001, + ), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server + ) + for lang in ['python', 'cpp', 'go', 'java', 'js'] # do not support rust now +} + +# Please download the needed `xx.jsonl.gz` from +# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx +# and move them into `data/humanevalx/` folder +for lang in ['python', 'cpp', 'go', 'java', 'js']: + compassbench_v1_1_code_datasets.append( + dict( + type=HumanevalXDataset, + abbr=f'humanevalx-{lang}', + language=lang, + path='data/compassbench_v1.1/code/humanevalx', + reader_cfg=humanevalx_reader_cfg, + infer_cfg=humanevalx_infer_cfg, + eval_cfg=humanevalx_eval_cfg_dict[lang], + ) + ) + +# --------------------------------------------------------------- LCBench --------------------------------------------------------------- +LC_difficulties_list = ['EASY', 'MEDIUM', 'HARD'] +LC_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + + +LC_en_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . 
An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with ["a","b","c"], we need to push the key one time to type "a", two times to type "b", and three times to type "c" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "), + dict(role='HUMAN', prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. 
Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"), + dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + + +LC_cn_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k(1 <= k <= n),你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意,x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word,由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 ["a","b","c"],我们需要按一次键来输入 "a",按两次键来输入 "b",按三次键来输入 "c"。\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1,*,# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in 
enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + + +LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT') + +for difficulty in LC_difficulties_list: + compassbench_v1_1_code_datasets.append( + dict( + type=LCDataset, + abbr='lcbench_en-' + difficulty, + path='data/compassbench_v1.1/code/LCBench2023/LCBench2023.jsonl', + difficulty=difficulty, + reader_cfg=LC_reader_cfg, + infer_cfg=LC_en_infer_cfg, + eval_cfg=LC_eval_cfg, + ) + ) + compassbench_v1_1_code_datasets.append( + dict( + type=LCDataset, + abbr='lcbench_cn-' + difficulty, + path='data/compassbench_v1.1/code/LCBench2023/LCBench2023_cn.jsonl', + difficulty=difficulty, + reader_cfg=LC_reader_cfg, + infer_cfg=LC_cn_infer_cfg, + eval_cfg=LC_eval_cfg, + ) + ) + + +# --------------------------------------------------------------- TACO --------------------------------------------------------------- +TACO_difficulties_list = ['EASY', 'MEDIUM', 'MEDIUM_HARD', 'HARD', 'VERY_HARD'] +TACO_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test') + +TACO_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. 
\nQUESTION:\n{question} {starter}\nANSWER:\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +TACO_eval_cfg = dict(evaluator=dict(type=TACOEvaluator), pred_role='BOT') + +for difficulty in TACO_difficulties_list: + compassbench_v1_1_code_datasets.append( + dict( + type=TACODataset, + abbr='TACO-' + difficulty, + path='data/compassbench_v1.1/code/BAAI-TACO', + difficulty=difficulty, + reader_cfg=TACO_reader_cfg, + infer_cfg=TACO_infer_cfg, + eval_cfg=TACO_eval_cfg, + ) + ) diff --git a/configs/datasets/compassbench_20_v1_1/knowledge/compassbench_v1_knowledge_gen_bd74e0.py b/configs/datasets/compassbench_20_v1_1/knowledge/compassbench_v1_knowledge_gen_bd74e0.py new file mode 100644 index 00000000..c3163212 --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1/knowledge/compassbench_v1_knowledge_gen_bd74e0.py @@ -0,0 +1,133 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import WikiBenchDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +# ----------------------- Prompt Content----------------------- # +zero_shot_prompts = { + 'single_choice_prompts': [ + dict(role='HUMAN', prompt='以下是一道单项选择题,请你根据你了解的知识给出正确的答案选项。\n下面是你要回答的题目::\n{question}\n答案选项:'), + dict(role='BOT', prompt='{answer}') + ] +} +few_shot_prompts = { + 'single_choice_prompts': { + 'single_choice_cn': [ + dict(role='HUMAN', prompt='题目:“一丝不苟”中的“丝”的本意是( )。\nA. 计量单位\nB. 丝线\nC. 丝绸\nD. 发丝'), + dict(role='BOT', prompt='答案:A'), + dict(role='HUMAN', prompt='题目:五华县体育场位于哪个省?\nA. 湖南省\nB. 浙江省\nC. 广东省\nD. 江苏省'), + dict(role='BOT', prompt='答案:C'), + dict(role='HUMAN', prompt='题目:“西施犬的原产地是哪里?\nA. 印度\nB. 中国\nC. 西藏\nD. 台湾'), + dict(role='BOT', prompt='答案:C'), + dict(role='HUMAN', prompt='题目:四库全书的四库是指什么?\nA. 易、书、诗、礼\nB. 经、史、子、音\nC. 诗、书、音、律\nD. 
经、史、子、集'), + dict(role='BOT', prompt='答案:D'), + dict(role='HUMAN', prompt='题目:{question}'), + ]} +} + + +# ----------------------- Prompt Template----------------------- # + +# Use Zero-Shot or not +with_few_shot = True + +# Max for this dataset is 4, should be set with `with_few_shot` +few_shot_samples = 4 + +# Use circular evaluation or not +with_circular_eval = True + +single_choice_prompts = zero_shot_prompts['single_choice_prompts'] if not with_few_shot else few_shot_prompts['single_choice_prompts'] + +# Set few shot prompt number +if with_few_shot: + assert few_shot_samples > 0 + for _name in list(single_choice_prompts.keys()): + single_choice_prompts[_name] = single_choice_prompts[_name][- few_shot_samples * 2 - 2:] + +compassbench_v1_knowledge_sets = { + 'common_knowledge': ['single_choice_cn'], + 'humanity': ['single_choice_cn'], + 'natural_science': ['single_choice_cn'], + 'social_science': ['single_choice_cn'], +} + + +# ----------------------- Dataset Config----------------------- # +compassbench_v1_knowledge_datasets = [] + +for _split in list(compassbench_v1_knowledge_sets.keys()): + for _name in compassbench_v1_knowledge_sets[_split]: + compassbench_v1_knowledge_reader_cfg = dict(input_columns=['question'], output_column='answer') + + compassbench_v1_knowledge_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=single_choice_prompts[_name] + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + compassbench_v1_knowledge_eval_cfg = dict( + evaluator=dict(type=CircularEvaluator if with_circular_eval else AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + ) + + compassbench_v1_knowledge_datasets.append( + dict( + type=WikiBenchDataset, + abbr='compassbench_v1_knowledge-' + _split + '-' + _name + '_' + 'circular' if with_circular_eval else '', + path=f'data/compassbench_v1.1/knowledge/{_split}/{_name}.jsonl', + name=_name + '_circular' if with_circular_eval else _name, + reader_cfg=compassbench_v1_knowledge_reader_cfg, + infer_cfg=compassbench_v1_knowledge_infer_cfg, + eval_cfg=compassbench_v1_knowledge_eval_cfg, + ) + ) + + +from opencompass.datasets import TriviaQADataset_V3, TriviaQAEvaluator + +triviaqa_and_nq_reader_cfg = dict(input_columns=['question'], output_column='answer') + +triviaqa_and_nq_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Question: When do stores stop selling alcohol in indiana?'), + dict(role='BOT', prompt='Answer: 3 a.m.'), + dict(role='HUMAN', prompt='Question: Edinburgh of the Seven Seas is the capital of which group of islands?'), + dict(role='BOT', prompt='Answer: Tristan da Cunha'), + dict(role='HUMAN', prompt='Question: Which book of the Christian Bible\'s new testament comprises a letter from St Paul to members of a church that he had founded at Macedonia?'), + dict(role='BOT', prompt='Answer: Philippians'), + dict(role='HUMAN', prompt='Question: The Hindu deity Hanuman appears in the form of which animal?'), + dict(role='BOT', prompt='Answer: A monkey'), + dict(role='HUMAN', prompt='Question: Who hosts the ITV quiz show The Chase?'), + dict(role='BOT', prompt='Answer: Bradley Walsh'), + dict(role='HUMAN', prompt='Question: {question}'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=['Question:']), +) + +triviaqa_and_nq_eval_cfg = 
dict(evaluator=dict(type=TriviaQAEvaluator), pred_role='BOT') + +compassbench_v1_knowledge_datasets.append( + dict( + type=TriviaQADataset_V3, + abbr='compassbench_v1_knowledge-mixed-cloze_en', + path='data/compassbench_v1.1/knowledge/mixed/cloze_en.jsonl', + reader_cfg=triviaqa_and_nq_reader_cfg, + infer_cfg=triviaqa_and_nq_infer_cfg, + eval_cfg=triviaqa_and_nq_eval_cfg + ) +) diff --git a/configs/datasets/compassbench_20_v1_1/language/compassbench_v1_language_gen_7aa06d.py b/configs/datasets/compassbench_20_v1_1/language/compassbench_v1_language_gen_7aa06d.py new file mode 100644 index 00000000..f3d172fb --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1/language/compassbench_v1_language_gen_7aa06d.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_evaluator import EMEvaluator, BleuFloresEvaluator, RougeEvaluator, JiebaRougeEvaluator + +compassbench_v1_language_datasets = [ + # dict(path='data/compassbench_v1.1/language/information_retrieval_en.jsonl', + # abbr='information_retrieval_en', data_type= 'qa', infer_method='gen', output_column='answers', + # human_prompt='{context}\n{origin_question}\nConcise and direct answer is', + # evaluator=EMEvaluator), + # dict(path='data/compassbench_v1.1/language/information_retrieval_zh.jsonl', + # abbr='information_retrieval_zh', data_type= 'qa', infer_method='gen', output_column='answers', + # human_prompt='{context}\n{origin_question}\n简单直接的回答是', + # evaluator=EMEvaluator), + + dict(path='data/compassbench_v1.1/language/intention_recognition_en.jsonl', + abbr='intention_recognition_en_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/language/intention_recognition_zh.jsonl', + abbr='intention_recognition_zh_circular', data_type='circular-mcq', infer_method='gen'), + + dict(path='data/compassbench_v1.1/language/sentiment_analysis_en.jsonl', + abbr='sentiment_analysis_en_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/language/sentiment_analysis_zh.jsonl', + abbr='sentiment_analysis_zh_circular', data_type='circular-mcq', infer_method='gen'), + + dict(path='data/compassbench_v1.1/language/translation.jsonl', + abbr='translation', data_type= 'qa', infer_method='gen', + evaluator=BleuFloresEvaluator), + + dict(path='data/compassbench_v1.1/language/content_critic_en.jsonl', + abbr='content_critic_en_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/language/content_critic_zh.jsonl', + abbr='content_critic_zh_circular', data_type='circular-mcq', infer_method='gen'), + + dict(path='data/compassbench_v1.1/language/content_summarization_en.jsonl', + abbr='content_summarization_en', data_type= 'qa', infer_method='gen', output_column='summary', + human_prompt='{article}\nSummary of the article is:\n', + evaluator=RougeEvaluator), + dict(path='data/compassbench_v1.1/language/content_summarization_zh.jsonl', + abbr='content_summarization_zh', data_type= 'qa', infer_method='gen', output_column='summary', + human_prompt='{article}\n上述内容摘要如下:\n', + evaluator=JiebaRougeEvaluator), + + dict(path='data/compassbench_v1.1/language/traditional_cultural_understanding_zh.jsonl', + abbr='traditional_cultural_understanding_zh_circular', data_type='circular-mcq', infer_method='gen'), + + dict(path='data/compassbench_v1.1/language/chinese_semantic_understanding_zh.jsonl', + abbr='chinese_semantic_understanding_zh_circular', data_type='circular-mcq', infer_method='gen'), +] diff --git 
a/configs/datasets/compassbench_20_v1_1/math/compassbench_v1_1_math_gen_1dc21d.py b/configs/datasets/compassbench_20_v1_1/math/compassbench_v1_1_math_gen_1dc21d.py new file mode 100644 index 00000000..eea7722d --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1/math/compassbench_v1_1_math_gen_1dc21d.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, mathbench_postprocess +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 4 +# Generate reasoning path or not, only for single choice +with_reasoning = False +# Use circular evaluation or not +with_circular_eval = True +# Use PPL mode in single choice test or not +use_ppl_single_choice = False + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +compassbench_v1_1_math_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=mathbench_postprocess, name=_name) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + compassbench_v1_1_math_datasets.append( + dict( + abbr='mathbench-' + _split + '-' + _name, + type=MathBenchDataset, + path=f'data/compassbench_v1.1/math/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/configs/datasets/compassbench_20_v1_1/math/mathbench_prompt.py b/configs/datasets/compassbench_20_v1_1/math/mathbench_prompt.py new file mode 100644 index 00000000..069528ee --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1/math/mathbench_prompt.py @@ -0,0 +1,103 @@ 
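# How the gen config above consumes these lists: every few-shot list in this file
# stores 4 HUMAN/BOT exemplar pairs followed by one HUMAN '{question}' / BOT '{answer}'
# template pair, so the slice `p[- 2 * num_shot - 2:]` keeps the last `num_shot`
# exemplars plus the trailing template pair. A minimal runnable sketch of that
# arithmetic (the list and variable names here are illustrative only, not part of this config):
example_prompt_list = ['H1', 'B1', 'H2', 'B2', 'H3', 'B3', 'H4', 'B4', 'H_template', 'B_template']
num_shot = 2
# -2 * 2 - 2 = -6, i.e. the last two exemplar pairs plus the template pair survive
assert example_prompt_list[- 2 * num_shot - 2:] == ['H3', 'B3', 'H4', 'B4', 'H_template', 'B_template']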
+zero_shot_prompts = { + 'single_choice_cn_with_reasoning': [ + dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:'), + ], + 'single_choice_cn': [ + dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n所以答案是:'), + ], + 'single_choice_en_with_reasoning': [ + dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with "Therefore, the correct answer is option X", Where "X" is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet\'s think step by step:'), + ], + 'single_choice_en': [ + dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'), + ], +} + +few_shot_prompts = { + 'single_choice_cn': [ + dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____?\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'), + dict(role='BOT', prompt='回答: D'), + dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='回答: D'), + dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'), + dict(role='BOT', prompt='回答: A'), + dict(role='HUMAN', prompt='问题: 圆锥的底面半径为2,高为4.一个圆柱的下底面在圆锥的底面上,上底面的圆周在圆锥的侧面上,当圆柱侧面积为$4 \\pi$时,该圆柱的体积为____\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'), + dict(role='BOT', prompt='回答: B'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ], + 'single_choice_cn_with_reasoning' : [ + dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'), + dict(role='BOT', prompt='回答: 因为首先,我们将等式两边同时乘以$(3 + i)$的共轭复数$(3 - i)$,以便消去分母中的虚数部分:$z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$,这样做的目的是利用复数乘法的性质,从而消去分母中的虚数部分。我们进行乘法运算:$z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$,$z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$,由于$i^2 = -1$,我们可以进一步简化:$z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{{1}{2} - \\frac{1}{2}i$,在复平面上,这个点位于第四象限,因为它的实部是正的($\\frac{1}{2}$),而虚部是负的($-\\frac{1}{2}$)\n答案选项:D'), + dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='回答: 将函数$y=\\tan(\\omega x-1)$(其中$\\omega>0$)的图像向左平移2个单位长度后,得到的新函数为$y=\\tan(\\omega(x+2)-1)$。为了使这个新函数的图像与$y=\\tan(\\omega x+3)$的图像重合,我们需要满足以下条件:\n$\\tan(\\omega(x+2)-1) = \\tan(\\omega x+3)$\n由于正切函数的周期性,我们知道$\\tan(\\theta) = \\tan(\\theta + k\\pi)$,其中$k$是任意整数。因此,我们可以写出:\n$\\omega(x+2)-1 = \\omega x+3 + k\\pi$\n解这个方程,我们得到:$\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$\n$2\\omega - 1 = 3 + k\\pi$\n$\\omega = 2 + \\frac{k\\pi}{2}$\n由于$\\omega>0$,我们需要找到满足条件的最小正整数$k$。当$k=0$时,我们得到$\\omega=2$,这是满足条件的最小正值。因此,最小值等于2。答案选项:D'), + dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. 
$a=-1,b=-1$'), + dict(role='BOT', prompt='回答: 我们可以将这个方程写成:$a + 2ai + b = 2i$,然后将实部和虚部分开:\n$a + b = 0$\n$2a = 2$\n由于$a + b = 0$,我们可以得到$a = -b$。将这个值代入$2a = 2$,我们得到$2(-b) = 2$,因此$b = -1$。因此,$a = 1$。答案选项:A'), + dict(role='HUMAN', prompt='问题: 一直角三角形的两边长分别为3和4.则第三边的长为____\nA. 5\nB. $\\sqrt 7$\nC. $\\sqrt 5$\nD. 5或者$\\sqrt 7$'), + dict(role='BOT', prompt='回答: 我们可以使用勾股定理来解决这个问题。勾股定理告诉我们,如果一个三角形的两条边长分别为$a$和$b$,那么第三条边的长度为$c = \\sqrt{a^2 + b^2}$。在这个问题中,$a = 3$,$b = 4$,因此$c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$。答案选项:A'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ], + 'single_choice_en': [ + dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'), + dict(role='BOT', prompt='Response: D'), + dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='Response: D'), + dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'), + dict(role='BOT', prompt='Response: A'), + dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'), + dict(role='BOT', prompt='Response: B'), + dict(role='HUMAN', prompt='Question: {question}'), + dict(role='BOT', prompt='Response: {answer}'), + ], + 'single_choice_en_with_reasoning': [ + dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'), + dict(role='BOT', prompt='Response: First, we multiply both sides of the equation by the conjugate of $(3 + i)$: $z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$. We perform the multiplication: $z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$, $z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$. Since $i^2 = -1$, we can simplify this further: $z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$. In the complex plane, this point is located in the fourth quadrant, because its real part is positive ($\\frac{1}{2}$) and its imaginary part is negative ($-\\frac{1}{2}$)\nAnswer option: D'), + dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='Response: In order for the graph of this new function to coincide with the graph of $y = \\tan(\\omega x + 3)$, we need to satisfy the following condition: $\\tan(\\omega(x + 2) - 1) = \\tan(\\omega x + 3)$. 
Therefore, we can write: $\\omega(x + 2) - 1 = \\omega x + 3 + k\\pi$. Solving this equation, we get: $\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$. $2\\omega - 1 = 3 + k\\pi$. $\\omega = 2 + \\frac{k\\pi}{2}$. Since $\\omega > 0$, we need to find the smallest positive integer $k$ that satisfies the condition. When $k = 0$, we get $\\omega = 2$, which is the smallest positive value that satisfies the condition. Therefore, the minimum value is 2. Answer option: D'), + dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'), + dict(role='BOT', prompt='Response: We can write this equation as: $a + 2ai + b = 2i$, and then separate the real and imaginary parts: $a + b = 0$. $2a = 2$. Since $a + b = 0$, we can get $a = -b$. Substituting this value into $2a = 2$, we get $2(-b) = 2$, so $b = -1$. Therefore, $a = 1$. Answer option: A'), + dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'), + dict(role='BOT', prompt='Response: We can use the Pythagorean theorem to solve this problem. The Pythagorean theorem tells us that if the two sides of a triangle are $a$ and $b$, then the length of the third side is $c = \\sqrt{a^2 + b^2}$. In this problem, $a = 3$ and $b = 4$, so $c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$. Answer option: A'), + dict(role='HUMAN', prompt='Question: {question}'), + dict(role='BOT', prompt='Response: {answer}'), + ], + 'cloze_cn': [ + dict(role='HUMAN', prompt='问题: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'), + dict(role='BOT', prompt='回答: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9'), + dict(role='HUMAN', prompt='问题: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'), + dict(role='BOT', prompt='回答: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29'), + dict(role='HUMAN', prompt='问题: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'), + dict(role='BOT', prompt='回答: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33'), + dict(role='HUMAN', prompt='问题: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'), + dict(role='BOT', prompt='回答: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ], + 'cloze_en': [ + dict(role='HUMAN', prompt='Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'), + dict(role='BOT', prompt='Response: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.'), + dict(role='HUMAN', prompt='Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'), + dict(role='BOT', prompt='Response: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. 
The answer is 29.'), + dict(role='HUMAN', prompt='Question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'), + dict(role='BOT', prompt='Response: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.'), + dict(role='HUMAN', prompt='Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'), + dict(role='BOT', prompt='Response: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.'), + dict(role='HUMAN', prompt='Question: {question}'), + dict(role='BOT', prompt='Response: {answer}'), + ], +} + +mathbench_sets = { + # Practice Part + 'college': ['single_choice_cn', 'single_choice_en'], + 'high': ['single_choice_cn', 'single_choice_en'], + 'middle': ['single_choice_cn', 'single_choice_en'], + 'primary': ['cloze_cn', 'cloze_en'], + 'arithmetic': ['cloze_en'], + # Theory part + 'college_knowledge': ['single_choice_cn', 'single_choice_en'], + 'high_knowledge': ['single_choice_cn', 'single_choice_en'], + 'middle_knowledge': ['single_choice_cn', 'single_choice_en'], + 'primary_knowledge': ['single_choice_cn', 'single_choice_en'], +} diff --git a/configs/datasets/compassbench_20_v1_1/reason/compassbench_v1_reason_gen_d26d08.py b/configs/datasets/compassbench_20_v1_1/reason/compassbench_v1_reason_gen_d26d08.py new file mode 100644 index 00000000..2255392c --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1/reason/compassbench_v1_reason_gen_d26d08.py @@ -0,0 +1,28 @@ +compassbench_v1_reason_datasets = [ + dict(path='data/compassbench_v1.1/reason/CN_Commonsense.jsonl', + abbr='reasonbench_cn_commonsense_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/CN_Abductive_Alphanlg_translated.jsonl', + abbr='reasonbench_cn_abductive_alphanlg_translated_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/CN_Deductive_BBH3obj_translated.jsonl', + abbr='reasonbench_cn_deductive_bbh3obj_translated_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/CN_Deductive_LogiQA_zh.jsonl', + abbr='reasonbench_cn_deductive_logiqa_zh_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/CN_Inductive_DEER_translated.jsonl', + abbr='reasonbench_cn_inductive_deer_translated_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/CN_Inductive_selfgenerated.jsonl', + abbr='reasonbench_cn_inductive_selfgenerated_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/EN_Commonsense.jsonl', + abbr='reasonbench_en_commonsense_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/EN_Abductive_Alphanlg.jsonl', + abbr='reasonbench_en_abductive_alphanlg_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/EN_Deductive_BBH7obj.jsonl', + abbr='reasonbench_en_deductive_bbh7obj_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/EN_Deductive_LogiQA_zh_translated.jsonl', + abbr='reasonbench_en_deductive_logiqa_zh_translated_circular', 
data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/EN_Deductive_OCNLI_translated.jsonl', + abbr='reasonbench_en_deductive_ocnli_translated_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/EN_Inductive_DEER.jsonl', + abbr='reasonbench_en_inductive_deer_circular', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1/reason/EN_Inductive_selfgenerated.jsonl', + abbr='reasonbench_en_inductive_selfgenerated_circular', data_type='circular-mcq', infer_method='gen'), +] diff --git a/configs/datasets/compassbench_20_v1_1_public/agent/cibench_template_gen_e6b12a.py b/configs/datasets/compassbench_20_v1_1_public/agent/cibench_template_gen_e6b12a.py new file mode 100644 index 00000000..a0c52ab4 --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1_public/agent/cibench_template_gen_e6b12a.py @@ -0,0 +1,57 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import CIBenchDataset, CIBenchEvaluator + +libs = [ + '/lightgbm', + '/matplotlib', + '/nltk', + '/opencv', + '/pandas', + '/pytorch', + '/scipy', + '/seaborn', + '/sklearn', + '/tensorflow', + '_chinese/lightgbm', + '_chinese/matplotlib', + '_chinese/nltk', + '_chinese/opencv', + '_chinese/pandas', + '_chinese/pytorch', + '_chinese/scipy', + '_chinese/seaborn', + '_chinese/sklearn', + '_chinese/tensorflow', +] + + +cibench_datasets = [] +for lib in libs: + cibench_reader_cfg = dict( + input_columns=['questions'], output_column='references', train_split='test', test_split='test' + ) + + cibench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{questions}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer, infer_mode='every'), + ) + + cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role='BOT') + + cibench_datasets.append( + dict( + abbr=f'cibench_template{lib}_public', + type=CIBenchDataset, + path=f'data/compassbench_v1.1.public/agent-cibench/cibench_template{lib}', + internet_check=False, + reader_cfg=cibench_reader_cfg, + infer_cfg=cibench_infer_cfg, + eval_cfg=cibench_eval_cfg, + ) + ) diff --git a/configs/datasets/compassbench_20_v1_1_public/agent/mus_teval_gen_105c48.py b/configs/datasets/compassbench_20_v1_1_public/agent/mus_teval_gen_105c48.py new file mode 100644 index 00000000..fd86fe4c --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1_public/agent/mus_teval_gen_105c48.py @@ -0,0 +1,56 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import ChatInferencer +from opencompass.openicl.icl_evaluator import TEvalEvaluator +from opencompass.datasets import teval_postprocess, TEvalDataset + +plugin_eval_subject_mapping = { + 'instruct': ['instruct_v1'], + 'instruct_zh': ['instruct_v1_zh'], + 'plan': ['plan_json_v1', 'plan_str_v1'], + 'plan_zh': ['plan_json_v1_zh', 'plan_str_v1_zh'], + 'review': ['review_str_v1'], + 'review_zh': ['review_str_v1_zh'], + 'reason_retrieve_understand': ['reason_retrieve_understand_json_v1'], + 'reason_retrieve_understand_zh': ['reason_retrieve_understand_json_v1_zh'], + 'reason': ['reason_str_v1'], + 'reason_zh': ['reason_str_v1_zh'], + 'retrieve': ['retrieve_str_v1'], + 'retrieve_zh': ['retrieve_str_v1_zh'], + 
'understand': ['understand_str_v1'], + 'understand_zh': ['understand_str_v1_zh'], +} + +plugin_eval_datasets = [] +for _name in plugin_eval_subject_mapping: + plugin_eval_reader_cfg = dict(input_columns=['prompt'], output_column='ground_truth') + plugin_eval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=ChatInferencer), + ) + plugin_eval_eval_cfg = dict( + evaluator=dict(type=TEvalEvaluator, subset=_name), + pred_postprocessor=dict(type=teval_postprocess), + num_gpus=1, + ) + + for subset in plugin_eval_subject_mapping[_name]: + plugin_eval_datasets.append( + dict( + abbr='plugin_eval-mus-p10-' + subset + '_public', + type=TEvalDataset, + path='data/compassbench_v1.1.public/agent-teval-p10', + name=subset, + reader_cfg=plugin_eval_reader_cfg, + infer_cfg=plugin_eval_infer_cfg, + eval_cfg=plugin_eval_eval_cfg, + ) + ) diff --git a/configs/datasets/compassbench_20_v1_1_public/code/compassbench_v1_1_code_gen_986f01.py b/configs/datasets/compassbench_20_v1_1_public/code/compassbench_v1_1_code_gen_986f01.py new file mode 100644 index 00000000..b9513914 --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1_public/code/compassbench_v1_1_code_gen_986f01.py @@ -0,0 +1,291 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, HumanEvalPlusEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import MBPPDataset, SanitizedMBPPDataset, MBPPEvaluator +from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator +from opencompass.datasets import LCDataset, LCPassKEvaluator +from opencompass.datasets import TACODataset, TACOEvaluator + +compassbench_v1_1_code_datasets = [] + +# --------------------------------------------------------------- HumanEval CN --------------------------------------------------------------- +humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test') + +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='完成以下Python代码任务:\n{prompt}'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +compassbench_v1_1_code_datasets.append( + dict( + abbr='openai_humaneval_cn_public', + type=HumanevalDataset, + path='data/compassbench_v1.1.public/code/humaneval_cn/human-eval-cn-v2-20210705.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg, + ) +) + +# --------------------------------------------------------------- HumanEval Plus --------------------------------------------------------------- +humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Complete the following python code:\n{prompt}'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + 
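# The HumanEval-style eval configs in this file route predictions through
# `humaneval_postprocess_v2` before the evaluator (typically) appends them to the
# original prompt and runs the unit tests. A minimal illustrative sketch of what such a
# post-processing step usually does (an assumption for illustration, not OpenCompass's
# actual implementation): strip chat/markdown wrapping so only executable code remains.
import re

def illustrative_code_postprocess(text: str) -> str:
    """Keep the body of the first fenced code block if one is present; otherwise return the text unchanged."""
    match = re.search(r'```(?:[Pp]ython)?\s*\n(.*?)```', text, re.DOTALL)
    return match.group(1) if match else text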
+humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalPlusEvaluator), + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +compassbench_v1_1_code_datasets.append( + dict( + abbr='humaneval_plus_public', + type=HumanevalDataset, + path='data/compassbench_v1.1.public/code/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg, + ) +) + +# --------------------------------------------------------------- MBPP CN --------------------------------------------------------------- +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2', test_range='[0:5]') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +compassbench_v1_1_code_datasets.append( + dict( + type=MBPPDataset, + abbr='mbpp_cn_public', + path='data/compassbench_v1.1.public/code/mbpp_cn/mbpp_cn.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +) + +# --------------------------------------------------------------- Sanitized MBPP --------------------------------------------------------------- +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2', test_range='[0:5]') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. 
Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +compassbench_v1_1_code_datasets.append( + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp_public', + path='data/compassbench_v1.1.public/code/mbpp/sanitized-mbpp.jsonl', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +) + +# --------------------------------------------------------------- HumanevalX --------------------------------------------------------------- +humanevalx_reader_cfg = dict(input_columns=['prompt'], output_column='declaration', train_split='test') + +humanevalx_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +humanevalx_eval_cfg_dict = { + lang: dict( + evaluator=dict( + type=HumanevalXEvaluator, + language=lang, + ip_address= + 'localhost', # replace to your code_eval_server ip_address, port + port=5001, + ), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server + ) + for lang in ['python', 'cpp', 'go', 'java', 'js'] # do not support rust now +} + +# Please download the needed `xx.jsonl.gz` from +# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx +# and move them into `data/humanevalx/` folder +for lang in ['python', 'cpp', 'go', 'java', 'js']: + compassbench_v1_1_code_datasets.append( + dict( + type=HumanevalXDataset, + abbr=f'humanevalx-{lang}_public', + language=lang, + path='data/compassbench_v1.1.public/code/humanevalx', + 
reader_cfg=humanevalx_reader_cfg, + infer_cfg=humanevalx_infer_cfg, + eval_cfg=humanevalx_eval_cfg_dict[lang], + ) + ) + +# --------------------------------------------------------------- LCBench --------------------------------------------------------------- +LC_difficulties_list = ['EASY', 'MEDIUM', 'HARD'] +LC_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + + +LC_en_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with ["a","b","c"], we need to push the key one time to type "a", two times to type "b", and three times to type "c" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. 
Your code should pass these tests:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "), + dict(role='HUMAN', prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"), + dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + + +LC_cn_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k(1 <= k <= n),你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意,x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word,由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 ["a","b","c"],我们需要按一次键来输入 "a",按两次键来输入 "b",按三次键来输入 "c"。\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 
最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1,*,# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "), + dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + + +LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT') + +for difficulty in LC_difficulties_list: + compassbench_v1_1_code_datasets.append( + dict( + type=LCDataset, + abbr='lcbench_en-' + difficulty + '_public', + path='data/compassbench_v1.1.public/code/LCBench2023/LCBench2023.jsonl', + difficulty=difficulty, + reader_cfg=LC_reader_cfg, + infer_cfg=LC_en_infer_cfg, + eval_cfg=LC_eval_cfg, + ) + ) + compassbench_v1_1_code_datasets.append( + dict( + type=LCDataset, + abbr='lcbench_cn-' + difficulty + '_public', + path='data/compassbench_v1.1.public/code/LCBench2023/LCBench2023_cn.jsonl', + difficulty=difficulty, + reader_cfg=LC_reader_cfg, + infer_cfg=LC_cn_infer_cfg, + eval_cfg=LC_eval_cfg, + ) + ) + + +# --------------------------------------------------------------- TACO --------------------------------------------------------------- +TACO_difficulties_list = ['EASY', 'MEDIUM', 'MEDIUM_HARD', 'HARD', 'VERY_HARD'] +TACO_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test', test_range='[0:5]') + +TACO_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. 
\nQUESTION:\n{question} {starter}\nANSWER:\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +TACO_eval_cfg = dict(evaluator=dict(type=TACOEvaluator), pred_role='BOT') + +for difficulty in TACO_difficulties_list: + compassbench_v1_1_code_datasets.append( + dict( + type=TACODataset, + abbr='TACO-' + difficulty + '_public', + path='data/compassbench_v1.1.public/code/BAAI-TACO', + difficulty=difficulty, + reader_cfg=TACO_reader_cfg, + infer_cfg=TACO_infer_cfg, + eval_cfg=TACO_eval_cfg, + ) + ) diff --git a/configs/datasets/compassbench_20_v1_1_public/knowledge/compassbench_v1_knowledge_gen_bd74e0.py b/configs/datasets/compassbench_20_v1_1_public/knowledge/compassbench_v1_knowledge_gen_bd74e0.py new file mode 100644 index 00000000..3c9dd284 --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1_public/knowledge/compassbench_v1_knowledge_gen_bd74e0.py @@ -0,0 +1,133 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import WikiBenchDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +# ----------------------- Prompt Content----------------------- # +zero_shot_prompts = { + 'single_choice_prompts': [ + dict(role='HUMAN', prompt='以下是一道单项选择题,请你根据你了解的知识给出正确的答案选项。\n下面是你要回答的题目::\n{question}\n答案选项:'), + dict(role='BOT', prompt='{answer}') + ] +} +few_shot_prompts = { + 'single_choice_prompts': { + 'single_choice_cn': [ + dict(role='HUMAN', prompt='题目:“一丝不苟”中的“丝”的本意是( )。\nA. 计量单位\nB. 丝线\nC. 丝绸\nD. 发丝'), + dict(role='BOT', prompt='答案:A'), + dict(role='HUMAN', prompt='题目:五华县体育场位于哪个省?\nA. 湖南省\nB. 浙江省\nC. 广东省\nD. 江苏省'), + dict(role='BOT', prompt='答案:C'), + dict(role='HUMAN', prompt='题目:“西施犬的原产地是哪里?\nA. 印度\nB. 中国\nC. 西藏\nD. 台湾'), + dict(role='BOT', prompt='答案:C'), + dict(role='HUMAN', prompt='题目:四库全书的四库是指什么?\nA. 易、书、诗、礼\nB. 经、史、子、音\nC. 诗、书、音、律\nD. 
经、史、子、集'), + dict(role='BOT', prompt='答案:D'), + dict(role='HUMAN', prompt='题目:{question}'), + ]} +} + + +# ----------------------- Prompt Template----------------------- # + +# Use Zero-Shot or not +with_few_shot = True + +# Max for this dataset is 4, should be set with `with_few_shot` +few_shot_samples = 4 + +# Use circular evaluation or not +with_circular_eval = True + +single_choice_prompts = zero_shot_prompts['single_choice_prompts'] if not with_few_shot else few_shot_prompts['single_choice_prompts'] + +# Set few shot prompt number +if with_few_shot: + assert few_shot_samples > 0 + for _name in list(single_choice_prompts.keys()): + single_choice_prompts[_name] = single_choice_prompts[_name][- few_shot_samples * 2 - 2:] + +compassbench_v1_knowledge_sets = { + 'common_knowledge': ['single_choice_cn'], + 'humanity': ['single_choice_cn'], + 'natural_science': ['single_choice_cn'], + 'social_science': ['single_choice_cn'], +} + + +# ----------------------- Dataset Config----------------------- # +compassbench_v1_knowledge_datasets = [] + +for _split in list(compassbench_v1_knowledge_sets.keys()): + for _name in compassbench_v1_knowledge_sets[_split]: + compassbench_v1_knowledge_reader_cfg = dict(input_columns=['question'], output_column='answer') + + compassbench_v1_knowledge_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=single_choice_prompts[_name] + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + compassbench_v1_knowledge_eval_cfg = dict( + evaluator=dict(type=CircularEvaluator if with_circular_eval else AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + ) + + compassbench_v1_knowledge_datasets.append( + dict( + type=WikiBenchDataset, + abbr='compassbench_v1_knowledge-' + _split + '-' + _name + '_' + ('circular' if with_circular_eval else '') + '_public', + path=f'data/compassbench_v1.1.public/knowledge/{_split}/{_name}.jsonl', + name=_name + '_circular' if with_circular_eval else _name, + reader_cfg=compassbench_v1_knowledge_reader_cfg, + infer_cfg=compassbench_v1_knowledge_infer_cfg, + eval_cfg=compassbench_v1_knowledge_eval_cfg, + ) + ) + + +from opencompass.datasets import TriviaQADataset_V3, TriviaQAEvaluator + +triviaqa_and_nq_reader_cfg = dict(input_columns=['question'], output_column='answer') + +triviaqa_and_nq_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Question: When do stores stop selling alcohol in indiana?'), + dict(role='BOT', prompt='Answer: 3 a.m.'), + dict(role='HUMAN', prompt='Question: Edinburgh of the Seven Seas is the capital of which group of islands?'), + dict(role='BOT', prompt='Answer: Tristan da Cunha'), + dict(role='HUMAN', prompt='Question: Which book of the Christian Bible\'s new testament comprises a letter from St Paul to members of a church that he had founded at Macedonia?'), + dict(role='BOT', prompt='Answer: Philippians'), + dict(role='HUMAN', prompt='Question: The Hindu deity Hanuman appears in the form of which animal?'), + dict(role='BOT', prompt='Answer: A monkey'), + dict(role='HUMAN', prompt='Question: Who hosts the ITV quiz show The Chase?'), + dict(role='BOT', prompt='Answer: Bradley Walsh'), + dict(role='HUMAN', prompt='Question: {question}'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=['Question:']), +) + +triviaqa_and_nq_eval_cfg = 
dict(evaluator=dict(type=TriviaQAEvaluator), pred_role='BOT') + +compassbench_v1_knowledge_datasets.append( + dict( + type=TriviaQADataset_V3, + abbr='compassbench_v1_knowledge-mixed-cloze_en_public', + path='data/compassbench_v1.1.public/knowledge/mixed/cloze_en.jsonl', + reader_cfg=triviaqa_and_nq_reader_cfg, + infer_cfg=triviaqa_and_nq_infer_cfg, + eval_cfg=triviaqa_and_nq_eval_cfg + ) +) diff --git a/configs/datasets/compassbench_20_v1_1_public/language/compassbench_v1_language_gen_7aa06d.py b/configs/datasets/compassbench_20_v1_1_public/language/compassbench_v1_language_gen_7aa06d.py new file mode 100644 index 00000000..d5dfec25 --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1_public/language/compassbench_v1_language_gen_7aa06d.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_evaluator import EMEvaluator, BleuFloresEvaluator, RougeEvaluator, JiebaRougeEvaluator + +compassbench_v1_language_datasets = [ + # dict(path='data/compassbench_v1.1.public/language/information_retrieval_en.jsonl', + # abbr='information_retrieval_en_public', data_type= 'qa', infer_method='gen', output_column='answers', + # human_prompt='{context}\n{origin_question}\nConcise and direct answer is', + # evaluator=EMEvaluator), + # dict(path='data/compassbench_v1.1.public/language/information_retrieval_zh.jsonl', + # abbr='information_retrieval_zh_public', data_type= 'qa', infer_method='gen', output_column='answers', + # human_prompt='{context}\n{origin_question}\n简单直接的回答是', + # evaluator=EMEvaluator), + + dict(path='data/compassbench_v1.1.public/language/intention_recognition_en.jsonl', + abbr='intention_recognition_en_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/language/intention_recognition_zh.jsonl', + abbr='intention_recognition_zh_circular_public', data_type='circular-mcq', infer_method='gen'), + + dict(path='data/compassbench_v1.1.public/language/sentiment_analysis_en.jsonl', + abbr='sentiment_analysis_en_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/language/sentiment_analysis_zh.jsonl', + abbr='sentiment_analysis_zh_circular_public', data_type='circular-mcq', infer_method='gen'), + + dict(path='data/compassbench_v1.1.public/language/translation.jsonl', + abbr='translation_public', data_type= 'qa', infer_method='gen', + evaluator=BleuFloresEvaluator), + + dict(path='data/compassbench_v1.1.public/language/content_critic_en.jsonl', + abbr='content_critic_en_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/language/content_critic_zh.jsonl', + abbr='content_critic_zh_circular_public', data_type='circular-mcq', infer_method='gen'), + + dict(path='data/compassbench_v1.1.public/language/content_summarization_en.jsonl', + abbr='content_summarization_en_public', data_type= 'qa', infer_method='gen', output_column='summary', + human_prompt='{article}\nSummary of the article is:\n', + evaluator=RougeEvaluator), + dict(path='data/compassbench_v1.1.public/language/content_summarization_zh.jsonl', + abbr='content_summarization_zh_public', data_type= 'qa', infer_method='gen', output_column='summary', + human_prompt='{article}\n上述内容摘要如下:\n', + evaluator=JiebaRougeEvaluator), + + dict(path='data/compassbench_v1.1.public/language/traditional_cultural_understanding_zh.jsonl', + abbr='traditional_cultural_understanding_zh_circular_public', data_type='circular-mcq', infer_method='gen'), + + 
dict(path='data/compassbench_v1.1.public/language/chinese_semantic_understanding_zh.jsonl', + abbr='chinese_semantic_understanding_zh_circular_public', data_type='circular-mcq', infer_method='gen'), +] diff --git a/configs/datasets/compassbench_20_v1_1_public/math/compassbench_v1_1_math_gen_1dc21d.py b/configs/datasets/compassbench_20_v1_1_public/math/compassbench_v1_1_math_gen_1dc21d.py new file mode 100644 index 00000000..e615b0d3 --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1_public/math/compassbench_v1_1_math_gen_1dc21d.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, mathbench_postprocess +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 4 +# Generate reasoning path or not, only for single choice +with_reasoning = False +# Use circular evaluation or not +with_circular_eval = True +# Use PPL mode in single choice test or not +use_ppl_single_choice = False + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +compassbench_v1_1_math_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=mathbench_postprocess, name=_name) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + compassbench_v1_1_math_datasets.append( + dict( + abbr='mathbench-' + _split + '-' + _name + '_public', + type=MathBenchDataset, + path=f'data/compassbench_v1.1.public/math/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git 
a/configs/datasets/compassbench_20_v1_1_public/math/mathbench_prompt.py b/configs/datasets/compassbench_20_v1_1_public/math/mathbench_prompt.py new file mode 100644 index 00000000..069528ee --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1_public/math/mathbench_prompt.py @@ -0,0 +1,103 @@ +zero_shot_prompts = { + 'single_choice_cn_with_reasoning': [ + dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:'), + ], + 'single_choice_cn': [ + dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n所以答案是:'), + ], + 'single_choice_en_with_reasoning': [ + dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with "Therefore, the correct answer is option X", where "X" is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet\'s think step by step:'), + ], + 'single_choice_en': [ + dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'), + ], +} + +few_shot_prompts = { + 'single_choice_cn': [ + dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____?\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'), + dict(role='BOT', prompt='回答: D'), + dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='回答: D'), + dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'), + dict(role='BOT', prompt='回答: A'), + dict(role='HUMAN', prompt='问题: 圆锥的底面半径为2,高为4.一个圆柱的下底面在圆锥的底面上,上底面的圆周在圆锥的侧面上,当圆柱侧面积为$4 \\pi$时,该圆柱的体积为____\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'), + dict(role='BOT', prompt='回答: B'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ], + 'single_choice_cn_with_reasoning': [ + dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'), + dict(role='BOT', prompt='回答: 因为首先,我们将等式两边同时乘以$(3 + i)$的共轭复数$(3 - i)$,以便消去分母中的虚数部分:$z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$,这样做的目的是利用复数乘法的性质,从而消去分母中的虚数部分。我们进行乘法运算:$z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$,$z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$,由于$i^2 = -1$,我们可以进一步简化:$z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$,在复平面上,这个点位于第四象限,因为它的实部是正的($\\frac{1}{2}$),而虚部是负的($-\\frac{1}{2}$)\n答案选项:D'), + dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 
2'), + dict(role='BOT', prompt='回答: 将函数$y=\\tan(\\omega x-1)$(其中$\\omega>0$)的图像向左平移2个单位长度后,得到的新函数为$y=\\tan(\\omega(x+2)-1)$。为了使这个新函数的图像与$y=\\tan(\\omega x+3)$的图像重合,我们需要满足以下条件:\n$\\tan(\\omega(x+2)-1) = \\tan(\\omega x+3)$\n由于正切函数的周期性,我们知道$\\tan(\\theta) = \\tan(\\theta + k\\pi)$,其中$k$是任意整数。因此,我们可以写出:\n$\\omega(x+2)-1 = \\omega x+3 + k\\pi$\n解这个方程,我们得到:$\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$\n$2\\omega - 1 = 3 + k\\pi$\n$\\omega = 2 + \\frac{k\\pi}{2}$\n由于$\\omega>0$,我们需要找到满足条件的最小正整数$k$。当$k=0$时,我们得到$\\omega=2$,这是满足条件的最小正值。因此,最小值等于2。答案选项:D'), + dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'), + dict(role='BOT', prompt='回答: 我们可以将这个方程写成:$a + 2ai + b = 2i$,然后将实部和虚部分开:\n$a + b = 0$\n$2a = 2$\n由于$a + b = 0$,我们可以得到$a = -b$。将这个值代入$2a = 2$,我们得到$2(-b) = 2$,因此$b = -1$。因此,$a = 1$。答案选项:A'), + dict(role='HUMAN', prompt='问题: 一直角三角形的两边长分别为3和4.则第三边的长为____\nA. 5\nB. $\\sqrt 7$\nC. $\\sqrt 5$\nD. 5或者$\\sqrt 7$'), + dict(role='BOT', prompt='回答: 我们可以使用勾股定理来解决这个问题。勾股定理告诉我们,如果一个三角形的两条边长分别为$a$和$b$,那么第三条边的长度为$c = \\sqrt{a^2 + b^2}$。在这个问题中,$a = 3$,$b = 4$,因此$c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$。答案选项:A'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ], + 'single_choice_en': [ + dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'), + dict(role='BOT', prompt='Response: D'), + dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='Response: D'), + dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'), + dict(role='BOT', prompt='Response: A'), + dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'), + dict(role='BOT', prompt='Response: B'), + dict(role='HUMAN', prompt='Question: {question}'), + dict(role='BOT', prompt='Response: {answer}'), + ], + 'single_choice_en_with_reasoning': [ + dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'), + dict(role='BOT', prompt='Response: First, we multiply both sides of the equation by the conjugate of $(3 + i)$: $z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$. We perform the multiplication: $z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$, $z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$. Since $i^2 = -1$, we can simplify this further: $z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$. 
In the complex plane, this point is located in the fourth quadrant, because its real part is positive ($\\frac{1}{2}$) and its imaginary part is negative ($-\\frac{1}{2}$)\nAnswer option: D'), + dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='Response: In order for the graph of this new function to coincide with the graph of $y = \\tan(\\omega x + 3)$, we need to satisfy the following condition: $\\tan(\\omega(x + 2) - 1) = \\tan(\\omega x + 3)$. Therefore, we can write: $\\omega(x + 2) - 1 = \\omega x + 3 + k\\pi$. Solving this equation, we get: $\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$. $2\\omega - 1 = 3 + k\\pi$. $\\omega = 2 + \\frac{k\\pi}{2}$. Since $\\omega > 0$, we need to find the smallest positive integer $k$ that satisfies the condition. When $k = 0$, we get $\\omega = 2$, which is the smallest positive value that satisfies the condition. Therefore, the minimum value is 2. Answer option: D'), + dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'), + dict(role='BOT', prompt='Response: We can write this equation as: $a + 2ai + b = 2i$, and then separate the real and imaginary parts: $a + b = 0$. $2a = 2$. Since $a + b = 0$, we can get $a = -b$. Substituting this value into $2a = 2$, we get $2(-b) = 2$, so $b = -1$. Therefore, $a = 1$. Answer option: A'), + dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'), + dict(role='BOT', prompt='Response: We can use the Pythagorean theorem to solve this problem. The Pythagorean theorem tells us that if the two sides of a triangle are $a$ and $b$, then the length of the third side is $c = \\sqrt{a^2 + b^2}$. In this problem, $a = 3$ and $b = 4$, so $c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$. Answer option: A'), + dict(role='HUMAN', prompt='Question: {question}'), + dict(role='BOT', prompt='Response: {answer}'), + ], + 'cloze_cn': [ + dict(role='HUMAN', prompt='问题: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'), + dict(role='BOT', prompt='回答: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9'), + dict(role='HUMAN', prompt='问题: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'), + dict(role='BOT', prompt='回答: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29'), + dict(role='HUMAN', prompt='问题: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'), + dict(role='BOT', prompt='回答: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33'), + dict(role='HUMAN', prompt='问题: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'), + dict(role='BOT', prompt='回答: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ], + 'cloze_en': [ + dict(role='HUMAN', prompt='Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. 
How many toys does he have now?'), + dict(role='BOT', prompt='Response: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.'), + dict(role='HUMAN', prompt='Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'), + dict(role='BOT', prompt='Response: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.'), + dict(role='HUMAN', prompt='Question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'), + dict(role='BOT', prompt='Response: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.'), + dict(role='HUMAN', prompt='Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'), + dict(role='BOT', prompt='Response: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.'), + dict(role='HUMAN', prompt='Question: {question}'), + dict(role='BOT', prompt='Response: {answer}'), + ], +} + +mathbench_sets = { + # Practice Part + 'college': ['single_choice_cn', 'single_choice_en'], + 'high': ['single_choice_cn', 'single_choice_en'], + 'middle': ['single_choice_cn', 'single_choice_en'], + 'primary': ['cloze_cn', 'cloze_en'], + 'arithmetic': ['cloze_en'], + # Theory part + 'college_knowledge': ['single_choice_cn', 'single_choice_en'], + 'high_knowledge': ['single_choice_cn', 'single_choice_en'], + 'middle_knowledge': ['single_choice_cn', 'single_choice_en'], + 'primary_knowledge': ['single_choice_cn', 'single_choice_en'], +} diff --git a/configs/datasets/compassbench_20_v1_1_public/reason/compassbench_v1_reason_gen_d26d08.py b/configs/datasets/compassbench_20_v1_1_public/reason/compassbench_v1_reason_gen_d26d08.py new file mode 100644 index 00000000..a2180661 --- /dev/null +++ b/configs/datasets/compassbench_20_v1_1_public/reason/compassbench_v1_reason_gen_d26d08.py @@ -0,0 +1,28 @@ +compassbench_v1_reason_datasets = [ + dict(path='data/compassbench_v1.1.public/reason/CN_Commonsense.jsonl', + abbr='reasonbench_cn_commonsense_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/CN_Abductive_Alphanlg_translated.jsonl', + abbr='reasonbench_cn_abductive_alphanlg_translated_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/CN_Deductive_BBH3obj_translated.jsonl', + abbr='reasonbench_cn_deductive_bbh3obj_translated_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/CN_Deductive_LogiQA_zh.jsonl', + abbr='reasonbench_cn_deductive_logiqa_zh_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/CN_Inductive_DEER_translated.jsonl', + abbr='reasonbench_cn_inductive_deer_translated_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/CN_Inductive_selfgenerated.jsonl', + 
abbr='reasonbench_cn_inductive_selfgenerated_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/EN_Commonsense.jsonl', + abbr='reasonbench_en_commonsense_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/EN_Abductive_Alphanlg.jsonl', + abbr='reasonbench_en_abductive_alphanlg_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/EN_Deductive_BBH7obj.jsonl', + abbr='reasonbench_en_deductive_bbh7obj_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/EN_Deductive_LogiQA_zh_translated.jsonl', + abbr='reasonbench_en_deductive_logiqa_zh_translated_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/EN_Deductive_OCNLI_translated.jsonl', + abbr='reasonbench_en_deductive_ocnli_translated_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/EN_Inductive_DEER.jsonl', + abbr='reasonbench_en_inductive_deer_circular_public', data_type='circular-mcq', infer_method='gen'), + dict(path='data/compassbench_v1.1.public/reason/EN_Inductive_selfgenerated.jsonl', + abbr='reasonbench_en_inductive_selfgenerated_circular_public', data_type='circular-mcq', infer_method='gen'), +] diff --git a/configs/datasets/drop/drop_gen_8a9ed9.py b/configs/datasets/drop/deprecated_drop_gen_8a9ed9.py similarity index 100% rename from configs/datasets/drop/drop_gen_8a9ed9.py rename to configs/datasets/drop/deprecated_drop_gen_8a9ed9.py diff --git a/configs/datasets/drop/drop_gen_e18aaf.py b/configs/datasets/drop/drop_examples.py similarity index 78% rename from configs/datasets/drop/drop_gen_e18aaf.py rename to configs/datasets/drop/drop_examples.py index edc51b29..b977519a 100644 --- a/configs/datasets/drop/drop_gen_e18aaf.py +++ b/configs/datasets/drop/drop_examples.py @@ -1,18 +1,4 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import DropOpenAIDataset, DropOpenAIEvaluator - -drop_reader_cfg = dict( - input_columns=['prompt'], - output_column='answers', - train_split='validation', - test_split='validation', -) - -template = '''\ -You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below. - +drop_examples = '''\ # Examples --- Passage: Trunajaya rebellion or Trunajaya War was the ultimately unsuccessful rebellion waged by the Madurese prince Trunajaya and fighters from Makassar against the Mataram Sultanate and its Dutch East India Company supporters in Java during the 1670s. The rebellion was initially successful: the rebels defeated the royal army at Gegodog , captured most of the Javanese north coast, and took the Mataram capital Plered . King Amangkurat I died during the retreat of the royal court. His son and successor, Amangkurat II, requested help from the VOC in exchange for financial remuneration and geopolitical concessions. The VOC\'s subsequent involvement turned the tide of the war. VOC and Mataram forces recovered lost territories and overran Trunajaya\'s new capital at Kediri . 
However, the rebellion continued until the capture of Trunajaya at the end of 1679, and the defeat, death, or surrender of the other rebel leaders . Trunajaya was killed by Amangkurat II personally in 1680 while a prisoner of the VOC. After his father\'s death in 1677, Amangkurat II also faced rival claims to the throne. The most serious rival was his brother Pangeran Puger, who took the capital Plered in 1677 and did not surrender until 1681. @@ -27,29 +13,4 @@ Answer: Jacobs --- Passage: Hoping to rebound from their road loss to the Bills, the Chargers flew to Wembley Stadium for the 2008 International Series game with the New Orleans Saints. In the first quarter, San Diego trailed early as kicker Taylor Mehlhaff got a 23-yard field goal. The \'Bolts would respond with kicker Nate Kaeding getting a 33-yard field goal. In the second quarter, New Orleans regained the lead as QB Drew Brees (a former Charger) completed a 12-yard TD pass to WR Devery Henderson (with a failed PAT) and RB Deuce McAllister getting a 1-yard TD run. San Diego answered as QB Philip Rivers completed a 12-yard TD pass to RB LaDainian Tomlinson, but the Saints replied with Brees completing a 30-yard TD pass to WR Lance Moore. The Chargers closed out the half with Rivers completing a 12-yard TD pass to TE Antonio Gates. In the third quarter, New Orleans increased its lead Brees completing a 1-yard TD pass to TE Mark Campbell, after a very controversial Pass interference call on cornerback Cletis Gordon put the Saints on the 1-yard line. The \'Bolts would answer with Kaeding getting a 24-yard field goal. In the fourth quarter, the Saints continued to build its lead as FB Mike Karney got a 1-yard TD run. San Diego tried to rally as Kaeding nailed a 31-yard field goal, Rivers completed a 14-yard TD pass to WR Vincent Jackson, and Brees giving the \'Bolts a safety via an incomplete pass thrown into the back of his own endzone. However, New Orleans\' defense stiffened for the win. With the loss, the Chargers went into their bye week at 3-5. Question: How many total yards of touchdown passes did Drew Brees make? -Answer: 43 - -# Your Task - ---- -{prompt} - -Think step by step, then write a line of the form "Answer: $ANSWER" at the end of your response. 
-''' - -drop_infer_cfg = dict( - prompt_template=dict(type=PromptTemplate, template=template), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) - -drop_eval_cfg = dict(evaluator=dict(type=DropOpenAIEvaluator)) - -drop_datasets = [ - dict( - abbr='drop', - type=DropOpenAIDataset, - path='./data/drop_simple_eval/dev.jsonl', - reader_cfg=drop_reader_cfg, - infer_cfg=drop_infer_cfg, - eval_cfg=drop_eval_cfg) -] +Answer: 43''' diff --git a/configs/datasets/drop/drop_gen.py b/configs/datasets/drop/drop_gen.py index 473b615d..69954a35 100644 --- a/configs/datasets/drop/drop_gen.py +++ b/configs/datasets/drop/drop_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .drop_gen_e18aaf import drop_datasets + from .drop_openai_simple_evals_gen_3857b0 import drop_datasets diff --git a/configs/datasets/drop/drop_gen_a2697c.py b/configs/datasets/drop/drop_gen_a2697c.py new file mode 100644 index 00000000..5a23a8f5 --- /dev/null +++ b/configs/datasets/drop/drop_gen_a2697c.py @@ -0,0 +1,43 @@ +# USED IN BASE MODEL +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DropOpenAIDataset, DropOpenAIEvaluator + +with read_base(): + from .drop_examples import drop_examples # noqa: F401, F403 + +drop_reader_cfg = dict( + input_columns=['prompt'], + output_column='answers', + train_split='validation', + test_split='validation', +) + +template = f'''\ +You will be asked to read a passage and answer a question. Think step by step, then write a line of the form "Answer: $ANSWER" at the end of your response. Some examples of passages and Q&A are provided below. + +{drop_examples} + +# Your Task + +--- +{{prompt}}''' + +drop_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=[dict(role='HUMAN', prompt=template)])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, stopping_criteria=['---', 'Passage', 'Question', 'You will be asked']),) + +drop_eval_cfg = dict(evaluator=dict(type=DropOpenAIEvaluator)) + +drop_datasets = [ + dict( + abbr='drop', + type=DropOpenAIDataset, + path='data/drop_simple_eval/dev.jsonl', + reader_cfg=drop_reader_cfg, + infer_cfg=drop_infer_cfg, + eval_cfg=drop_eval_cfg) +] diff --git a/configs/datasets/drop/drop_gen_eb14af.py b/configs/datasets/drop/drop_gen_eb14af.py new file mode 100644 index 00000000..d3d523e6 --- /dev/null +++ b/configs/datasets/drop/drop_gen_eb14af.py @@ -0,0 +1,34 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DropOpenAIDataset, DropOpenAIEvaluator + +with read_base(): + from .drop_examples import drop_examples # noqa: F401, F403 + +drop_reader_cfg = dict( + input_columns=['prompt'], + output_column='answers', + train_split='validation', + test_split='validation', +) + +template = f'You will be asked to read a passage and answer a question. Think step by step, then write a line of the form "Answer: $ANSWER" at the end of your response. 
Some examples of passages and Q&A are provided below.\n\n{drop_examples}\n\n# Your Task\n\n---\n{{prompt}}' + +drop_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=[dict(role='HUMAN', prompt=template)])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +drop_eval_cfg = dict(evaluator=dict(type=DropOpenAIEvaluator)) + +drop_datasets = [ + dict( + abbr='drop', + type=DropOpenAIDataset, + path='data/drop_simple_eval/dev.jsonl', + reader_cfg=drop_reader_cfg, + infer_cfg=drop_infer_cfg, + eval_cfg=drop_eval_cfg) +] diff --git a/configs/datasets/drop/drop_openai_simple_evals_gen_3857b0.py b/configs/datasets/drop/drop_openai_simple_evals_gen_3857b0.py new file mode 100644 index 00000000..a46e2fde --- /dev/null +++ b/configs/datasets/drop/drop_openai_simple_evals_gen_3857b0.py @@ -0,0 +1,34 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DropOpenAIDataset, DropOpenAIEvaluator + +with read_base(): + from .drop_examples import drop_examples # noqa: F401, F403 + +drop_reader_cfg = dict( + input_columns=['prompt'], + output_column='answers', + train_split='validation', + test_split='validation', +) + +template = f'You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.\n\n{drop_examples}\n\n# Your Task\n\n---\n{{prompt}}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.' + +drop_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=[dict(role='HUMAN', prompt=template)])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +drop_eval_cfg = dict(evaluator=dict(type=DropOpenAIEvaluator)) + +drop_datasets = [ + dict( + abbr='drop', + type=DropOpenAIDataset, + path='data/drop_simple_eval/dev.jsonl', + reader_cfg=drop_reader_cfg, + infer_cfg=drop_infer_cfg, + eval_cfg=drop_eval_cfg) +] diff --git a/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_a58960.py b/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_a58960.py new file mode 100644 index 00000000..d958b88e --- /dev/null +++ b/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_a58960.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import MATHEvaluator, math_postprocess_v2 + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +gsm8k_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), +) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='./data/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg, + ) +] 
diff --git a/configs/datasets/humaneval/README.md b/configs/datasets/humaneval/README.md index c0e1e157..ce004785 100644 --- a/configs/datasets/humaneval/README.md +++ b/configs/datasets/humaneval/README.md @@ -1,7 +1,7 @@ # HumanEval ```bash -python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug +python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug ``` diff --git a/configs/datasets/humaneval/humaneval_gen_4a6eef.py b/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py similarity index 90% rename from configs/datasets/humaneval/humaneval_gen_4a6eef.py rename to configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py index 81c7023f..1eac2c69 100644 --- a/configs/datasets/humaneval/humaneval_gen_4a6eef.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess), diff --git a/configs/datasets/humaneval/humaneval_gen_6d1cc2.py b/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py similarity index 90% rename from configs/datasets/humaneval/humaneval_gen_6d1cc2.py rename to configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py index 9740039e..c1f2aca5 100644 --- a/configs/datasets/humaneval/humaneval_gen_6d1cc2.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess), diff --git a/configs/datasets/humaneval/deprecated_humaneval_gen_8e312c.py b/configs/datasets/humaneval/deprecated_humaneval_gen_8e312c.py deleted file mode 100644 index 2b9e20ff..00000000 --- a/configs/datasets/humaneval/deprecated_humaneval_gen_8e312c.py +++ /dev/null @@ -1,36 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, 
HumanEvaluator, humaneval_postprocess - -humaneval_reader_cfg = dict( - input_columns=['prompt'], output_column='task_id', train_split='test') - -# TODO: allow empty output-column -humaneval_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt='Complete the following python code:\n{prompt}'), - ])), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) - -humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), - pred_role='BOT', - k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type=humaneval_postprocess), -) - -humaneval_datasets = [ - dict( - abbr='openai_humaneval', - type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', - reader_cfg=humaneval_reader_cfg, - infer_cfg=humaneval_infer_cfg, - eval_cfg=humaneval_eval_cfg) -] diff --git a/configs/datasets/humaneval/humaneval_gen_a82cae.py b/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py similarity index 88% rename from configs/datasets/humaneval/humaneval_gen_a82cae.py rename to configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py index 851d5d6f..78239204 100644 --- a/configs/datasets/humaneval/humaneval_gen_a82cae.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval/humaneval_gen_d2537e.py b/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py similarity index 87% rename from configs/datasets/humaneval/humaneval_gen_d2537e.py rename to configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py index a416ee32..07158f0b 100644 --- a/configs/datasets/humaneval/humaneval_gen_d2537e.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -16,7 +16,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval/humaneval_gen_fd5822.py 
b/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py similarity index 87% rename from configs/datasets/humaneval/humaneval_gen_fd5822.py rename to configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py index 6d0c9903..ff28e89e 100644 --- a/configs/datasets/humaneval/humaneval_gen_fd5822.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -15,7 +15,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess), ) diff --git a/configs/datasets/humaneval/humaneval_gen_ff7054.py b/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py similarity index 90% rename from configs/datasets/humaneval/humaneval_gen_ff7054.py rename to configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py index 35b2d9d0..63343c7c 100644 --- a/configs/datasets/humaneval/humaneval_gen_ff7054.py +++ b/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -24,7 +24,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess), diff --git a/configs/datasets/humaneval/humaneval_gen_66a7f4.py b/configs/datasets/humaneval/humaneval_gen_66a7f4.py new file mode 100644 index 00000000..a5c4ebfd --- /dev/null +++ b/configs/datasets/humaneval/humaneval_gen_66a7f4.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test') + +HUMANEVAL_TEMPLATE = dict( + round=[ + dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'), + ] +) + +humaneval_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +humaneval_eval_cfg = dict( + 
evaluator=dict(type=HumanEvalEvaluator), + k=[1, 10, 100], + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg, + ) +] diff --git a/configs/datasets/humaneval/humaneval_gen_6f294d.py b/configs/datasets/humaneval/humaneval_gen_6f294d.py deleted file mode 100644 index 3d0eeaa1..00000000 --- a/configs/datasets/humaneval/humaneval_gen_6f294d.py +++ /dev/null @@ -1,41 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess - -humaneval_reader_cfg = dict( - input_columns=['prompt'], output_column='task_id', train_split='test') - -# TODO: allow empty output-column -humaneval_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt='Complete the following python code:'), - ], - round=[ - dict(role='HUMAN', prompt='{prompt}'), - ])), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) - -humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), - pred_role='BOT', - k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type=humaneval_postprocess), -) - -humaneval_datasets = [ - dict( - abbr='openai_humaneval', - type=HumanevalDataset, - path='./data/humaneval/human-eval-v2-20210705.jsonl', - reader_cfg=humaneval_reader_cfg, - infer_cfg=humaneval_infer_cfg, - eval_cfg=humaneval_eval_cfg) -] diff --git a/configs/datasets/humaneval/humaneval_gen_8e312c.py b/configs/datasets/humaneval/humaneval_gen_8e312c.py index 8a0c6c7c..93d884c9 100644 --- a/configs/datasets/humaneval/humaneval_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_gen_8e312c.py @@ -1,7 +1,8 @@ +# THIS SHALL ALSO BE DEPRECATED from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +20,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py b/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py index 2050eb74..dd06990e 100644 --- a/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py +++ b/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import 
HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py b/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py index d1696511..d9439c40 100644 --- a/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py b/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py index 5eff32c2..f2bf918d 100644 --- a/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py b/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py index 0dcff6b3..c8611b5a 100644 --- a/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py +++ b/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( 
input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py b/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py index 4073824f..1dda06d2 100644 --- a/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py +++ b/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py b/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py index 3cafadc3..34d2c5e4 100644 --- a/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py +++ b/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator), + evaluator=dict(type=HumanEvalEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py b/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py new file mode 100644 index 00000000..cbe5a8dc --- /dev/null +++ b/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test') + +HUMANEVAL_TEMPLATE = dict( + round=[ + dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the 
following Python function?\n```python\n{prompt}\n```'), + ] +) + +humaneval_plus_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalPlusEvaluator), + k=[1, 10, 100], + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_plus_datasets = [ + dict( + abbr='humaneval_plus', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg, + ) +] diff --git a/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py b/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py index f1f61387..de988c09 100644 --- a/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py +++ b/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py @@ -1,7 +1,8 @@ +# THIS SHALL ALSO BE DEPRECATED from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2 humaneval_plus_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +20,7 @@ humaneval_plus_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_plus_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator, metric='EvalPlus'), + evaluator=dict(type=HumanEvalPlusEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py b/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py index 1ff4c4dd..f2ba72a3 100644 --- a/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py +++ b/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_plus_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_plus_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_plus_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator, metric='EvalPlus'), + evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py b/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py index bb859a7a..45b5dd66 100644 --- a/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py +++ b/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from 
opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 humaneval_plus_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -19,7 +19,7 @@ humaneval_plus_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) humaneval_plus_eval_cfg = dict( - evaluator=dict(type=HumanEvaluator, metric='EvalPlus'), + evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval pred_postprocessor=dict(type=humaneval_postprocess_v2), diff --git a/configs/datasets/mmlu_pro/mmlu_pro_categories.py b/configs/datasets/mmlu_pro/mmlu_pro_categories.py new file mode 100644 index 00000000..eff38983 --- /dev/null +++ b/configs/datasets/mmlu_pro/mmlu_pro_categories.py @@ -0,0 +1,16 @@ +categories = [ + 'math', + 'physics', + 'chemistry', + 'law', + 'engineering', + 'other', + 'economics', + 'health', + 'psychology', + 'business', + 'biology', + 'philosophy', + 'computer science', + 'history', +] diff --git a/configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py b/configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py new file mode 100644 index 00000000..7dd143b1 --- /dev/null +++ b/configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py @@ -0,0 +1,58 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUProDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_pro_categories import categories + + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + + mmlu_pro_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Question:\n{question}\nOptions:\n{options_str}'), + dict(role='BOT', prompt="Answer: Let's think step by step. 
{cot_content}") + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict(role='HUMAN', prompt='Question:\n{question}\nOptions:\n{options_str}'), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCDEFGHIJKLMNOP'), + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) diff --git a/configs/datasets/nq/nq_open_1shot_gen_2e45e5.py b/configs/datasets/nq/nq_open_1shot_gen_2e45e5.py new file mode 100644 index 00000000..d676b949 --- /dev/null +++ b/configs/datasets/nq/nq_open_1shot_gen_2e45e5.py @@ -0,0 +1,61 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever, RandomRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import NQOpenDataset, NQEvaluator + +nq_datasets = [] +for k in [1]: + nq_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='train', test_split='validation') + + if k == 0: + nq_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}?'), + dict(role='BOT', prompt='A:'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50) + ) + else: + nq_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}?'), + dict(role='BOT', prompt='A: {answer}.\n'), + ] + ), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict(role='HUMAN', prompt='Q: {question}?'), + dict(role='BOT', prompt='A:'), + ] + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))), + inferencer=dict(type=GenInferencer, max_out_len=50), + ) + + nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role='BOT') + + nq_datasets.append( + dict( + type=NQOpenDataset, + abbr=f'nq_open_{k}shot', + path='./data/nq-open/', + reader_cfg=nq_reader_cfg, + infer_cfg=nq_infer_cfg, + eval_cfg=nq_eval_cfg) + ) diff --git a/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py b/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py new file mode 100644 index 00000000..ed072523 --- /dev/null +++ b/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import AlignmentBenchDataset + +subjective_reader_cfg = dict( + input_columns=['question', 'capability', 'critiquellm_prefix'], + output_column='judge', + ) + +subjective_all_sets = [ + 'alignment_bench_v1_1', +] +data_path ='data/subjective/alignment_bench' + +alignment_bench_config_path = 'data/subjective/alignment_bench/config' +alignment_bench_config_name = 'multi-dimension' + +subjective_datasets = [] + +for _name in 
subjective_all_sets: + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{question}' + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n' + ), + ]), + ), + ), + pred_role='BOT', + ) + + subjective_datasets.append( + dict( + abbr=f'{_name}', + type=AlignmentBenchDataset, + path=data_path, + name=_name, + alignment_bench_config_path=alignment_bench_config_path, + alignment_bench_config_name=alignment_bench_config_name, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/datasets/subjective/compassbench/compassbench_compare.py b/configs/datasets/subjective/compassbench/compassbench_compare.py index 942eca7c..b015d18d 100644 --- a/configs/datasets/subjective/compassbench/compassbench_compare.py +++ b/configs/datasets/subjective/compassbench/compassbench_compare.py @@ -13,7 +13,7 @@ data_path ='data/subjective/compassbench' subjective_datasets = [] -versions = ['CompassbenchV1'] +versions = ['CompassBenchV1.1'] for version_abbr in versions: subjective_infer_cfg = dict( diff --git a/configs/datasets/subjective/compassbench/compassbench_compare_v11.py b/configs/datasets/subjective/compassbench/compassbench_compare_v11.py new file mode 100644 index 00000000..2831eae6 --- /dev/null +++ b/configs/datasets/subjective/compassbench/compassbench_compare_v11.py @@ -0,0 +1,58 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import CompassBenchControlLengthBiasDataset + +subjective_reader_cfg = dict( + input_columns=['question', 'judge_prompt'], + output_column='judge', + ) + +data_path ='data/subjective/compassbench' + +subjective_datasets = [] + +versions = ['CompassBenchV1.1'] + +for version_abbr in versions: + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{question}' + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt = '{judge_prompt}' + ), + ]), + ), + ), + pred_role='BOT', + ) + + subjective_datasets.append( + dict( + abbr=version_abbr, + type=CompassBenchControlLengthBiasDataset, + path=data_path, + name=version_abbr, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/datasets/subjective/compassbench/compassbench_compare_v11_patch.py b/configs/datasets/subjective/compassbench/compassbench_compare_v11_patch.py new file mode 100644 index 00000000..176845a4 --- /dev/null +++ b/configs/datasets/subjective/compassbench/compassbench_compare_v11_patch.py @@ -0,0 +1,58 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from 
opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import CompassBenchControlLengthBiasDataset + +subjective_reader_cfg = dict( + input_columns=['question', 'judge_prompt'], + output_column='judge', + ) + +data_path ='data/subjective/compassbench' + +subjective_datasets = [] + +versions = ['CompassBenchV1.1.patch', 'CompassBenchV1.1.patch.en'] + +for version_abbr in versions: + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{question}' + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt = '{judge_prompt}' + ), + ]), + ), + ), + pred_role='BOT', + ) + + subjective_datasets.append( + dict( + abbr=version_abbr, + type=CompassBenchControlLengthBiasDataset, + path=data_path, + name=version_abbr, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/datasets/subjective/compassbench/compassbench_compare_v12.py b/configs/datasets/subjective/compassbench/compassbench_compare_v12.py new file mode 100644 index 00000000..6f283bd0 --- /dev/null +++ b/configs/datasets/subjective/compassbench/compassbench_compare_v12.py @@ -0,0 +1,58 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import CompassBenchControlLengthBiasDataset + +subjective_reader_cfg = dict( + input_columns=['question', 'judge_prompt'], + output_column='judge', + ) + +data_path ='data/subjective/compassbench' + +subjective_datasets = [] + +versions = ['CompassBenchV1.2'] + +for version_abbr in versions: + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{question}' + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt = '{judge_prompt}' + ), + ]), + ), + ), + pred_role='BOT', + ) + + subjective_datasets.append( + dict( + abbr=version_abbr, + type=CompassBenchControlLengthBiasDataset, + path=data_path, + name=version_abbr, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/datasets/taco/taco_staged_gen_411572.py b/configs/datasets/taco/taco_levels_gen_411572.py similarity index 100% rename from configs/datasets/taco/taco_staged_gen_411572.py rename to configs/datasets/taco/taco_levels_gen_411572.py diff --git a/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_bc5f21.py b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_bc5f21.py new file mode 100644 index 00000000..156534dd --- /dev/null +++ b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_bc5f21.py @@ -0,0 +1,62 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever, 
FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator + + +triviaqa_datasets = [] +for k in [1]: + triviaqa_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='train', test_split='validation') + + if k == 0: + triviaqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}'), + dict(role='BOT', prompt='A:'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50) + ) + else: + triviaqa_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}'), + dict(role='BOT', prompt='A: {answer}.\n'), + ] + ), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict(role='HUMAN', prompt='Q: {question}'), + dict(role='BOT', prompt='A:'), + ] + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))), + inferencer=dict(type=GenInferencer, max_out_len=50), + ) + + triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role='BOT') + + triviaqa_datasets.append( + dict( + type=TriviaQADataset_V2, + abbr=f'triviaqa_wiki_{k}shot', + path='./data/triviaqa', + reader_cfg=triviaqa_reader_cfg, + infer_cfg=triviaqa_infer_cfg, + eval_cfg=triviaqa_eval_cfg) + ) diff --git a/configs/eval_hf_llama2.py b/configs/eval_hf_llama2.py index 129859e5..ccc2efd6 100644 --- a/configs/eval_hf_llama2.py +++ b/configs/eval_hf_llama2.py @@ -5,7 +5,7 @@ with read_base(): from .datasets.triviaqa.triviaqa_wiki_gen_d18bf4 import triviaqa_datasets from .datasets.nq.nq_open_gen_e93f8a import nq_datasets from .datasets.gsm8k.gsm8k_gen_3309bd import gsm8k_datasets - from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets + from .datasets.humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets from .datasets.agieval.agieval_mixed_713d14 import agieval_datasets from .datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets from .datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets diff --git a/configs/eval_internlm2_keyset.py b/configs/eval_internlm2_keyset.py index 4a8d92b8..54e0c1b5 100644 --- a/configs/eval_internlm2_keyset.py +++ b/configs/eval_internlm2_keyset.py @@ -6,7 +6,7 @@ with read_base(): from .datasets.bbh.bbh_gen_5b92b0 import bbh_datasets from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets from .datasets.math.math_gen_265cce import math_datasets - from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets + from .datasets.humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets from .datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets from .models.hf_internlm.hf_internlm2_7b import models as hf_internlm2_7b_model diff --git a/configs/eval_lightllm.py b/configs/eval_lightllm.py index fe45c14c..ad0fcee8 100644 --- a/configs/eval_lightllm.py +++ b/configs/eval_lightllm.py @@ -6,7 +6,7 @@ from opencompass.tasks import OpenICLInferTask with read_base(): from .summarizers.leaderboard import summarizer - from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets + from .datasets.humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets datasets = [*humaneval_datasets] diff --git a/configs/eval_mixtral_8x7b.py b/configs/eval_mixtral_8x7b.py deleted file mode 100644 index 
cd64eb0c..00000000 --- a/configs/eval_mixtral_8x7b.py +++ /dev/null @@ -1,8 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - from .datasets.collections.base_medium_llama import piqa_datasets, siqa_datasets - from .models.mixtral.mixtral_8x7b_32k import models - - -datasets = [*piqa_datasets, *siqa_datasets] diff --git a/configs/eval_mmlu_pro.py b/configs/eval_mmlu_pro.py new file mode 100644 index 00000000..ed8cd08e --- /dev/null +++ b/configs/eval_mmlu_pro.py @@ -0,0 +1,34 @@ +from mmengine.config import read_base + +with read_base(): + from .datasets.mmlu_pro.mmlu_pro_gen_cdbebf import mmlu_pro_datasets + + from .models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model + from .models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model + + from .summarizers.mmlu_pro import summarizer + from .internal.clusters.local import infer_num_worker as infer + from .internal.clusters.local import eval + +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], []) +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +work_dir = 'outputs/debug/mmlu_pro' + +# dataset version metric mode qwen2-7b-instruct-turbomind llama-3-8b-instruct-turbomind +# ------------------------- --------- ------------- ------ ----------------------------- ------------------------------- +# mmlu_pro - naive_average gen 46.18 43.92 +# mmlu_pro_biology 736233 accuracy gen 63.74 64.02 +# mmlu_pro_business 736233 accuracy gen 53.23 46.01 +# mmlu_pro_chemistry 736233 accuracy gen 35.25 32.42 +# mmlu_pro_computer_science 736233 accuracy gen 47.07 44.88 +# mmlu_pro_economics 736233 accuracy gen 59.00 53.79 +# mmlu_pro_engineering 736233 accuracy gen 26.73 33.54 +# mmlu_pro_health 736233 accuracy gen 47.31 51.34 +# mmlu_pro_history 736233 accuracy gen 42.78 42.26 +# mmlu_pro_law 736233 accuracy gen 28.07 26.98 +# mmlu_pro_math 736233 accuracy gen 53.59 37.53 +# mmlu_pro_philosophy 736233 accuracy gen 42.28 42.48 +# mmlu_pro_physics 736233 accuracy gen 39.11 33.64 +# mmlu_pro_psychology 736233 accuracy gen 60.90 59.65 +# mmlu_pro_other 736233 accuracy gen 47.40 46.32 diff --git a/configs/models/chatglm/hf_glm4_9b_chat.py b/configs/models/chatglm/hf_glm4_9b_chat.py new file mode 100644 index 00000000..67d7d5c3 --- /dev/null +++ b/configs/models/chatglm/hf_glm4_9b_chat.py @@ -0,0 +1,13 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='glm-4-9b-chat-hf', + path='THUDM/glm-4-9b-chat', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'], + ) +] diff --git a/configs/models/chatglm/vllm_chatglm3_6b.py b/configs/models/chatglm/vllm_chatglm3_6b.py new file mode 100644 index 00000000..63363b8a --- /dev/null +++ b/configs/models/chatglm/vllm_chatglm3_6b.py @@ -0,0 +1,13 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='chatglm3-6b-vllm', + path='THUDM/chatglm3-6b', + max_out_len=1024, + batch_size=16, + model_kwargs=dict(tensor_parallel_size=1), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/chatglm/vllm_glm4_9b_chat.py b/configs/models/chatglm/vllm_glm4_9b_chat.py new file mode 100644 index 00000000..c3041bd6 --- /dev/null +++ b/configs/models/chatglm/vllm_glm4_9b_chat.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + 
type=VLLMwithChatTemplate, + abbr='glm-4-9b-chat-vllm', + path='THUDM/glm-4-9b-chat', + max_out_len=1024, + batch_size=16, + model_kwargs=dict(tensor_parallel_size=1), + run_cfg=dict(num_gpus=1), + stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'], + ) +] diff --git a/configs/models/deepseek/hf_deepseek_v2_chat.py b/configs/models/deepseek/hf_deepseek_v2_chat.py index e8f28d78..3e871243 100644 --- a/configs/models/deepseek/hf_deepseek_v2_chat.py +++ b/configs/models/deepseek/hf_deepseek_v2_chat.py @@ -6,7 +6,7 @@ models = [ abbr='deepseek-v2-chat-hf', path='deepseek-ai/DeepSeek-V2-Chat', max_out_len=1024, - batch_size=4, + batch_size=2, model_kwargs=dict( device_map='sequential', torch_dtype='torch.bfloat16', diff --git a/configs/models/deepseek/hf_deepseek_v2_lite.py b/configs/models/deepseek/hf_deepseek_v2_lite.py new file mode 100644 index 00000000..bbf4b89a --- /dev/null +++ b/configs/models/deepseek/hf_deepseek_v2_lite.py @@ -0,0 +1,17 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='deepseek-v2-lite-hf', + path='deepseek-ai/DeepSeek-V2-Lite', + max_out_len=1024, + batch_size=4, + model_kwargs=dict( + device_map='sequential', + torch_dtype='torch.bfloat16', + attn_implementation='eager' + ), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/deepseek/hf_deepseek_v2_lite_chat.py b/configs/models/deepseek/hf_deepseek_v2_lite_chat.py new file mode 100644 index 00000000..07ca19fd --- /dev/null +++ b/configs/models/deepseek/hf_deepseek_v2_lite_chat.py @@ -0,0 +1,17 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-v2-lite-chat-hf', + path='deepseek-ai/DeepSeek-V2-Lite-Chat', + max_out_len=1024, + batch_size=4, + model_kwargs=dict( + device_map='sequential', + torch_dtype='torch.bfloat16', + attn_implementation='eager' + ), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/deepseek/lmdeploy_deepseek_67b_base.py b/configs/models/deepseek/lmdeploy_deepseek_67b_base.py new file mode 100644 index 00000000..f0c87e48 --- /dev/null +++ b/configs/models/deepseek/lmdeploy_deepseek_67b_base.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='deepseek-67b-base-turbomind', + path='deepseek-ai/deepseek-llm-67b-base', + engine_config=dict(session_len=7168, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py b/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py new file mode 100644 index 00000000..e369e6e1 --- /dev/null +++ b/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-67b-chat-turbomind', + path='deepseek-ai/deepseek-llm-67b-chat', + engine_config=dict(max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/deepseek/lmdeploy_deepseek_7b_base.py b/configs/models/deepseek/lmdeploy_deepseek_7b_base.py new file mode 100644 index 00000000..ef48a5f0 --- /dev/null +++ b/configs/models/deepseek/lmdeploy_deepseek_7b_base.py @@ -0,0 +1,15 @@ 
+from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='deepseek-7b-base-turbomind', + path='deepseek-ai/deepseek-llm-7b-base', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py b/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py new file mode 100644 index 00000000..26aa2afc --- /dev/null +++ b/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-7b-chat-turbomind', + path='deepseek-ai/deepseek-llm-7b-chat', + engine_config=dict(max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/deepseek/vllm_deepseek_67b_chat.py b/configs/models/deepseek/vllm_deepseek_67b_chat.py new file mode 100644 index 00000000..3c4d1fec --- /dev/null +++ b/configs/models/deepseek/vllm_deepseek_67b_chat.py @@ -0,0 +1,13 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='deepseek-67b-chat-vllm', + path='deepseek-ai/deepseek-llm-67b-chat', + max_out_len=1024, + batch_size=16, + model_kwargs=dict(tensor_parallel_size=4), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/deepseek/vllm_deepseek_7b_chat.py b/configs/models/deepseek/vllm_deepseek_7b_chat.py new file mode 100644 index 00000000..21308ae6 --- /dev/null +++ b/configs/models/deepseek/vllm_deepseek_7b_chat.py @@ -0,0 +1,13 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='deepseek-7b-chat-vllm', + path='deepseek-ai/deepseek-llm-7b-chat', + max_out_len=1024, + batch_size=16, + model_kwargs=dict(tensor_parallel_size=1), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/deepseek/vllm_deepseek_moe_16b_base.py b/configs/models/deepseek/vllm_deepseek_moe_16b_base.py new file mode 100644 index 00000000..1c6097f5 --- /dev/null +++ b/configs/models/deepseek/vllm_deepseek_moe_16b_base.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='deepseek-moe-16b-base-hf', + path='deepseek-ai/deepseek-moe-16b-base', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.6), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/deepseek/vllm_deepseek_moe_16b_chat.py b/configs/models/deepseek/vllm_deepseek_moe_16b_chat.py new file mode 100644 index 00000000..e88fa992 --- /dev/null +++ b/configs/models/deepseek/vllm_deepseek_moe_16b_chat.py @@ -0,0 +1,13 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='deepseek-moe-16b-chat-vllm', + path='deepseek-ai/deepseek-moe-16b-chat', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.6), + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/gemma/vllm_gemma_2b.py b/configs/models/gemma/vllm_gemma_2b.py new file mode 100644 index 00000000..75faea2a --- /dev/null +++ b/configs/models/gemma/vllm_gemma_2b.py 
@@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='gemma-2b-vllm', + path='google/gemma-2b', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/gemma/vllm_gemma_2b_it.py b/configs/models/gemma/vllm_gemma_2b_it.py new file mode 100644 index 00000000..347a39fc --- /dev/null +++ b/configs/models/gemma/vllm_gemma_2b_it.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='gemma-2b-it-vllm', + path='google/gemma-2b-it', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/gemma/vllm_gemma_7b.py b/configs/models/gemma/vllm_gemma_7b.py new file mode 100644 index 00000000..818f18eb --- /dev/null +++ b/configs/models/gemma/vllm_gemma_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='gemma-7b-vllm', + path='google/gemma-7b', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/gemma/vllm_gemma_7b_it.py b/configs/models/gemma/vllm_gemma_7b_it.py new file mode 100644 index 00000000..934aad8a --- /dev/null +++ b/configs/models/gemma/vllm_gemma_7b_it.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='gemma-7b-it-vllm', + path='google/gemma-7b-it', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/README.md b/configs/models/hf_internlm/README.md index 3c4b19d0..7f691785 100644 --- a/configs/models/hf_internlm/README.md +++ b/configs/models/hf_internlm/README.md @@ -26,7 +26,7 @@ python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug -python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug +python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug python3 run.py --models hf_internlm2_7b --datasets lcbench_gen_5ff288 --debug python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_1_8b.py b/configs/models/hf_internlm/lmdeploy_internlm2_1_8b.py new file mode 100644 index 00000000..fed3b5b8 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_1_8b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='internlm2-1.8b-turbomind', + path='internlm/internlm2-1_8b', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + 
max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_20b.py b/configs/models/hf_internlm/lmdeploy_internlm2_20b.py index 875d7bbd..e3441a05 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_20b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_20b.py @@ -1,27 +1,15 @@ -from opencompass.models.turbomind import TurboMindModel - +from opencompass.models import TurboMindModel models = [ dict( type=TurboMindModel, abbr='internlm2-20b-turbomind', path='internlm/internlm2-20b', - engine_config=dict( - session_len=32768, - max_batch_size=32, - model_name='internlm2-20b', - tp=2, - ), - gen_config=dict( - top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=2000, - ), - max_out_len=2000, - max_seq_len=32768, - batch_size=32, - concurrency=8, - run_cfg=dict(num_gpus=2, num_procs=1), + engine_config=dict(session_len=7168, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), ) ] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_7b.py b/configs/models/hf_internlm/lmdeploy_internlm2_7b.py new file mode 100644 index 00000000..b5f23b13 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='internlm2-7b-turbomind', + path='internlm/internlm2-7b', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_base_20b.py b/configs/models/hf_internlm/lmdeploy_internlm2_base_20b.py new file mode 100644 index 00000000..a66e68a3 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_base_20b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='internlm2-base-20b-turbomind', + path='internlm/internlm2-base-20b', + engine_config=dict(session_len=7168, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_base_7b.py b/configs/models/hf_internlm/lmdeploy_internlm2_base_7b.py new file mode 100644 index 00000000..5577935e --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_base_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='internlm2-base-7b-turbomind', + path='internlm/internlm2-base-7b', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py index 346e9341..9c358d5a 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py @@ -5,19 +5,11 @@ models = [ type=TurboMindModelwithChatTemplate, abbr='internlm2-chat-1.8b-turbomind', 
path='internlm/internlm2-chat-1_8b', - engine_config=dict( - max_batch_size=16, - tp=1, - ), - gen_config=dict( - top_k=1, - temperature=1e-6, - top_p=0.9, - ), - max_seq_len=2048, + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, max_out_len=1024, - batch_size=32768, + batch_size=16, run_cfg=dict(num_gpus=1), - stop_words=['', '<|im_end|>'], ) ] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b_sft.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b_sft.py new file mode 100644 index 00000000..47c163a3 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b_sft.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm2-chat-1.8b-sft-turbomind', + path='internlm/internlm2-chat-1_8b-sft', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py index f5c7b951..44371549 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py @@ -1,36 +1,15 @@ -from opencompass.models.turbomind import TurboMindModel - - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), - ], -) +from opencompass.models import TurboMindModelwithChatTemplate models = [ dict( - type=TurboMindModel, + type=TurboMindModelwithChatTemplate, abbr='internlm2-chat-20b-turbomind', path='internlm/internlm2-chat-20b', - meta_template=_meta_template, - engine_config=dict( - session_len=32768, - max_batch_size=32, - model_name='internlm2-chat-20b', - tp=2, - stop_words=[2, 92542], - ), - gen_config=dict( - top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=2000, - ), - max_out_len=2000, - max_seq_len=32768, - batch_size=32, - concurrency=8, - run_cfg=dict(num_gpus=2, num_procs=1), + engine_config=dict(session_len=7168, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), ) ] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b_sft.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b_sft.py new file mode 100644 index 00000000..fa8ede0a --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b_sft.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm2-chat-20b-sft-turbomind', + path='internlm/internlm2-chat-20b-sft', + engine_config=dict(session_len=7168, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py index b604a04c..82ad2e46 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py +++ 
b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py @@ -5,19 +5,11 @@ models = [ type=TurboMindModelwithChatTemplate, abbr='internlm2-chat-7b-turbomind', path='internlm/internlm2-chat-7b', - engine_config=dict( - max_batch_size=16, - tp=1, - ), - gen_config=dict( - top_k=1, - temperature=1e-6, - top_p=0.9, - ), - max_seq_len=2048, + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, max_out_len=1024, - batch_size=32768, + batch_size=16, run_cfg=dict(num_gpus=1), - stop_words=['', '<|im_end|>'], ) ] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b_sft.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b_sft.py new file mode 100644 index 00000000..09c69758 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b_sft.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm2-chat-7b-sft-turbomind', + path='internlm/internlm2-chat-7b-sft', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm_20b.py b/configs/models/hf_internlm/lmdeploy_internlm_20b.py new file mode 100644 index 00000000..761e90cf --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm_20b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='internlm-20b-turbomind', + path='internlm/internlm-20b', + engine_config=dict(session_len=4096, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm_7b.py b/configs/models/hf_internlm/lmdeploy_internlm_7b.py new file mode 100644 index 00000000..6f2f4cc0 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='internlm-7b-turbomind', + path='internlm/internlm-7b', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py b/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py new file mode 100644 index 00000000..8718a6cf --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm-chat-20b-turbomind', + path='internlm/internlm-chat-20b', + engine_config=dict(session_len=4096, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py b/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py new file mode 100644 index 00000000..ea61313a --- /dev/null +++ 
b/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm-chat-7b-turbomind', + path='internlm/internlm-chat-7b', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/vllm_internlm2_chat_1_8b.py b/configs/models/hf_internlm/vllm_internlm2_chat_1_8b.py new file mode 100644 index 00000000..d438cdcd --- /dev/null +++ b/configs/models/hf_internlm/vllm_internlm2_chat_1_8b.py @@ -0,0 +1,13 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='internlm2-chat-1.8b-vllm', + path='internlm/internlm2-chat-1_8b', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=32768, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/vllm_internlm2_chat_1_8b_sft.py b/configs/models/hf_internlm/vllm_internlm2_chat_1_8b_sft.py new file mode 100644 index 00000000..b9dd1a62 --- /dev/null +++ b/configs/models/hf_internlm/vllm_internlm2_chat_1_8b_sft.py @@ -0,0 +1,13 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='internlm2-chat-1.8b-sft-vllm', + path='internlm/internlm2-chat-1_8b-sft', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=32768, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/vllm_internlm2_chat_20b.py b/configs/models/hf_internlm/vllm_internlm2_chat_20b.py new file mode 100644 index 00000000..d025fa27 --- /dev/null +++ b/configs/models/hf_internlm/vllm_internlm2_chat_20b.py @@ -0,0 +1,13 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='internlm2-chat-20b-vllm', + path='internlm/internlm2-chat-20b', + model_kwargs=dict(tensor_parallel_size=2), + max_out_len=1024, + batch_size=32768, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/hf_internlm/vllm_internlm2_chat_20b_sft.py b/configs/models/hf_internlm/vllm_internlm2_chat_20b_sft.py new file mode 100644 index 00000000..f2b373e1 --- /dev/null +++ b/configs/models/hf_internlm/vllm_internlm2_chat_20b_sft.py @@ -0,0 +1,13 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='internlm2-chat-20b-sft-vllm', + path='internlm/internlm2-chat-20b-sft', + model_kwargs=dict(tensor_parallel_size=2), + max_out_len=1024, + batch_size=32768, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/hf_internlm/vllm_internlm2_chat_7b_sft.py b/configs/models/hf_internlm/vllm_internlm2_chat_7b_sft.py new file mode 100644 index 00000000..9f07c93e --- /dev/null +++ b/configs/models/hf_internlm/vllm_internlm2_chat_7b_sft.py @@ -0,0 +1,13 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='internlm2-chat-7b-sft-vllm', + path='internlm/internlm2-chat-7b-sft', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=32768, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_internlm/vllm_internlm2_series.py b/configs/models/hf_internlm/vllm_internlm2_series.py new file mode 100644 index 00000000..aee5a685 --- /dev/null +++ 
b/configs/models/hf_internlm/vllm_internlm2_series.py @@ -0,0 +1,25 @@ +from opencompass.models import VLLM + +settings = [ + ('internlm2-1.8b-vllm', 'internlm/internlm2-1_8b', 1), + ('internlm2-7b-vllm', 'internlm/internlm2-7b', 1), + ('internlm2-base-7b-vllm', 'internlm/internlm2-base-7b', 1), + ('internlm2-20b-vllm', 'internlm/internlm2-20b', 2), + ('internlm2-base-20b-vllm', 'internlm/internlm2-base-20b', 2), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=VLLM, + abbr=abbr, + path=path, + model_kwargs=dict(tensor_parallel_size=num_gpus), + max_out_len=100, + max_seq_len=2048, + batch_size=32, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=num_gpus, num_procs=1), + ) + ) diff --git a/configs/models/hf_llama/lmdeploy_llama2_13b.py b/configs/models/hf_llama/lmdeploy_llama2_13b.py new file mode 100644 index 00000000..379a6513 --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama2_13b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='llama-2-13b-turbomind', + path='meta-llama/Llama-2-13b-hf', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py b/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py new file mode 100644 index 00000000..cb42cb29 --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='llama-2-13b-chat-turbomind', + path='meta-llama/Llama-2-13b-chat-hf', + engine_config=dict(max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama2_70b.py b/configs/models/hf_llama/lmdeploy_llama2_70b.py new file mode 100644 index 00000000..90ad556d --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama2_70b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='llama-2-70b-turbomind', + path='meta-llama/Llama-2-70b-hf', + engine_config=dict(session_len=4096, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py b/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py new file mode 100644 index 00000000..d6c69c6f --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='llama-2-70b-chat-turbomind', + path='meta-llama/Llama-2-70b-chat-hf', + engine_config=dict(max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama2_7b.py b/configs/models/hf_llama/lmdeploy_llama2_7b.py new file mode 100644 index 00000000..0f2941df --- /dev/null +++ 
b/configs/models/hf_llama/lmdeploy_llama2_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='llama-2-7b-turbomind', + path='meta-llama/Llama-2-7b-hf', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py b/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py new file mode 100644 index 00000000..f520ce8b --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='llama-2-7b-chat-turbomind', + path='meta-llama/Llama-2-7b-chat-hf', + engine_config=dict(max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=4096, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama3_70b.py b/configs/models/hf_llama/lmdeploy_llama3_70b.py new file mode 100644 index 00000000..f16dc221 --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama3_70b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='llama-3-70b-turbomind', + path='meta-llama/Meta-Llama-3-70B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=4, cache_max_entry_count=0.7), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py b/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py index 8e51baae..333dc015 100644 --- a/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py +++ b/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py @@ -1,24 +1,16 @@ -from opencompass.models import TurboMindModel - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|begin_of_text|>user<|end_header_id|>\n\n', end='<|eot_id|>'), - dict(role='BOT', begin='<|begin_of_text|>assistant<|end_header_id|>\n\n', end='<|eot_id|>', generate=True), - ], -) +from opencompass.models import TurboMindModelwithChatTemplate models = [ dict( - type=TurboMindModel, - abbr='llama-3-70b-instruct-lmdeploy', + type=TurboMindModelwithChatTemplate, + abbr='llama-3-70b-instruct-turbomind', path='meta-llama/Meta-Llama-3-70B-Instruct', - engine_config=dict(session_len=4096, max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024, stop_words=[128001, 128009]), + engine_config=dict(max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, max_out_len=1024, - max_seq_len=4096, batch_size=16, - concurrency=16, - meta_template=_meta_template, run_cfg=dict(num_gpus=4), + stop_words=['<|end_of_text|>', '<|eot_id|>'], ) ] diff --git a/configs/models/hf_llama/lmdeploy_llama3_8b.py b/configs/models/hf_llama/lmdeploy_llama3_8b.py new file mode 100644 index 00000000..141e6b35 --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama3_8b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='llama-3-8b-turbomind', + path='meta-llama/Meta-Llama-3-8B', + 
engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py b/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py index ff8be649..cc5b3bd4 100644 --- a/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py +++ b/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py @@ -1,24 +1,16 @@ -from opencompass.models import TurboMindModel - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|begin_of_text|>user<|end_header_id|>\n\n', end='<|eot_id|>'), - dict(role='BOT', begin='<|begin_of_text|>assistant<|end_header_id|>\n\n', end='<|eot_id|>', generate=True), - ], -) +from opencompass.models import TurboMindModelwithChatTemplate models = [ dict( - type=TurboMindModel, - abbr='llama-3-8b-instruct-lmdeploy', + type=TurboMindModelwithChatTemplate, + abbr='llama-3-8b-instruct-turbomind', path='meta-llama/Meta-Llama-3-8B-Instruct', - engine_config=dict(session_len=4096, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024, stop_words=[128001, 128009]), + engine_config=dict(max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, max_out_len=1024, - max_seq_len=4096, batch_size=16, - concurrency=16, - meta_template=_meta_template, run_cfg=dict(num_gpus=1), + stop_words=['<|end_of_text|>', '<|eot_id|>'], ) ] diff --git a/configs/models/hf_llama/lmdeploy_llama_13b.py b/configs/models/hf_llama/lmdeploy_llama_13b.py new file mode 100644 index 00000000..bd7f91cb --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama_13b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='llama-13b-turbomind', + path='huggyllama/llama-13b', + engine_config=dict(session_len=2048, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=2048, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama_30b.py b/configs/models/hf_llama/lmdeploy_llama_30b.py new file mode 100644 index 00000000..375ff9e7 --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama_30b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='llama-30b-turbomind', + path='huggyllama/llama-30b', + engine_config=dict(session_len=2048, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=2048, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama_65b.py b/configs/models/hf_llama/lmdeploy_llama_65b.py new file mode 100644 index 00000000..e91cb0a0 --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama_65b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='llama-65b-turbomind', + path='huggyllama/llama-65b', + engine_config=dict(session_len=2048, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=2048, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama_7b.py 
b/configs/models/hf_llama/lmdeploy_llama_7b.py new file mode 100644 index 00000000..f215215b --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='llama-7b-turbomind', + path='huggyllama/llama-7b', + engine_config=dict(session_len=2048, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=2048, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama_series.py b/configs/models/hf_llama/lmdeploy_llama_series.py deleted file mode 100644 index 346d920d..00000000 --- a/configs/models/hf_llama/lmdeploy_llama_series.py +++ /dev/null @@ -1,30 +0,0 @@ -from opencompass.models import TurboMindModel - -settings = [ - ('llama-7b-turbomind', 'huggyllama/llama-7b', 1), - ('llama-13b-turbomind', 'huggyllama/llama-13b', 1), - ('llama-30b-turbomind', 'huggyllama/llama-30b', 2), - ('llama-65b-turbomind', 'huggyllama/llama-65b', 4), - ('llama-2-7b-turbomind', 'meta-llama/Llama-2-7b-hf', 1), - ('llama-2-13b-turbomind', 'meta-llama/Llama-2-13b-hf', 1), - ('llama-2-70b-turbomind', 'meta-llama/Llama-2-70b-hf', 4), - ('llama-3-8b-turbomind', 'meta-llama/Meta-Llama-3-8B', 1), - ('llama-3-70b-turbomind', 'meta-llama/Meta-Llama-3-70B', 4), -] - -models = [] -for abbr, path, num_gpus in settings: - models.append( - dict( - type=TurboMindModel, - abbr=abbr, - path=path, - engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), - gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), - max_out_len=1024, - max_seq_len=2048, - batch_size=16, - concurrency=16, - run_cfg=dict(num_gpus=num_gpus), - ) - ) diff --git a/configs/models/hf_llama/vllm_llama_series.py b/configs/models/hf_llama/vllm_llama_series.py new file mode 100644 index 00000000..3bef7159 --- /dev/null +++ b/configs/models/hf_llama/vllm_llama_series.py @@ -0,0 +1,29 @@ +from opencompass.models import VLLM + +settings = [ + ('llama-7b-vllm', 'huggyllama/llama-7b', 1), + ('llama-13b-vllm', 'huggyllama/llama-13b', 1), + ('llama-30b-vllm', 'huggyllama/llama-30b', 2), + ('llama-65b-vllm', 'huggyllama/llama-65b', 4), + ('llama-2-7b-vllm', 'meta-llama/Llama-2-7b-hf', 1), + ('llama-2-13b-vllm', 'meta-llama/Llama-2-13b-hf', 1), + ('llama-2-70b-vllm', 'meta-llama/Llama-2-70b-hf', 4), + ('llama-3-8b-vllm', 'meta-llama/Meta-Llama-3-8B', 1), + ('llama-3-70b-vllm', 'meta-llama/Meta-Llama-3-70B', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=VLLM, + abbr=abbr, + path=path, + model_kwargs=dict(tensor_parallel_size=num_gpus), + max_out_len=100, + max_seq_len=2048, + batch_size=32, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=num_gpus, num_procs=1), + ) + ) diff --git a/configs/models/mistral/lmdeploy_mistral_series.py b/configs/models/mistral/lmdeploy_mistral_series.py deleted file mode 100644 index 0bb07c52..00000000 --- a/configs/models/mistral/lmdeploy_mistral_series.py +++ /dev/null @@ -1,24 +0,0 @@ -from opencompass.models import LmdeployPytorchModel - -settings = [ - ('mistral-7b-v0.1-pytorch', 'mistralai/Mistral-7B-v0.1', 1), - ('mixtral-8x7b-v0.1-pytorch', 'mistralai/Mixtral-8x7B-v0.1', 2), - ('mixtral-8x22b-v0.1-pytorch', 'mistralai/Mixtral-8x22B-v0.1', 4), -] - -models = [] -for abbr, path, num_gpus in settings: - models.append( - dict( - type=LmdeployPytorchModel, - abbr=abbr, - path=path, - 
engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), - gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), - max_out_len=1024, - max_seq_len=2048, - batch_size=16, - concurrency=16, - run_cfg=dict(num_gpus=num_gpus), - ) - ) diff --git a/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py b/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py index db4094f6..140afa06 100644 --- a/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py +++ b/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py @@ -1,25 +1,15 @@ -from opencompass.models import VLLM +from opencompass.models import VLLMwithChatTemplate -_meta_template = dict( - begin='', - round=[ - dict(role='HUMAN', begin='[INST]', end='[/INST]'), - dict(role='BOT', begin='', end='', generate=True), - ], -) - models = [ dict( - type=VLLM, + type=VLLMwithChatTemplate, abbr='mistral-7b-instruct-v0.1-vllm', path='mistralai/Mistral-7B-Instruct-v0.1', - meta_template=_meta_template, - max_out_len=100, - max_seq_len=2048, - batch_size=32, + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=256, + batch_size=16, generation_kwargs=dict(temperature=0), - stop_words=[''], - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py b/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py index c1dd32ba..ddaa0dd7 100644 --- a/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py +++ b/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py @@ -1,25 +1,15 @@ -from opencompass.models import VLLM +from opencompass.models import VLLMwithChatTemplate -_meta_template = dict( - begin='', - round=[ - dict(role='HUMAN', begin='[INST]', end='[/INST]'), - dict(role='BOT', begin='', end='', generate=True), - ], -) - models = [ dict( - type=VLLM, + type=VLLMwithChatTemplate, abbr='mistral-7b-instruct-v0.2-vllm', path='mistralai/Mistral-7B-Instruct-v0.2', - meta_template=_meta_template, - max_out_len=100, - max_seq_len=2048, - batch_size=32, + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=256, + batch_size=16, generation_kwargs=dict(temperature=0), - stop_words=[''], - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/mistral/vllm_mistral_7b_v0_1.py b/configs/models/mistral/vllm_mistral_7b_v0_1.py index 32486ce2..7cf9406c 100644 --- a/configs/models/mistral/vllm_mistral_7b_v0_1.py +++ b/configs/models/mistral/vllm_mistral_7b_v0_1.py @@ -1,17 +1,15 @@ from opencompass.models import VLLM - models = [ dict( type=VLLM, abbr='mistral-7b-v0.1-vllm', path='mistralai/Mistral-7B-v0.1', - max_out_len=100, - max_seq_len=2048, - batch_size=32, - model_kwargs=dict(dtype='bfloat16'), - generation_kwargs=dict(temperature=0, top_p=1, max_tokens=2048, stop_token_ids=[2]), - run_cfg=dict(num_gpus=1, num_procs=1), - stop_words=['[INST]'], + model_kwargs=dict(dtype='bfloat16', tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/mistral/vllm_mistral_7b_v0_2.py b/configs/models/mistral/vllm_mistral_7b_v0_2.py index 22931d9a..f7af965d 100644 --- a/configs/models/mistral/vllm_mistral_7b_v0_2.py +++ b/configs/models/mistral/vllm_mistral_7b_v0_2.py @@ -1,17 +1,15 @@ from opencompass.models import VLLM - models = [ dict( type=VLLM, abbr='mistral-7b-v0.2-vllm', path='mistral-community/Mistral-7B-v0.2', - max_out_len=100, - max_seq_len=2048, - batch_size=32, - 
model_kwargs=dict(dtype='bfloat16'), - generation_kwargs=dict(temperature=0, top_p=1, max_tokens=2048, stop_token_ids=[2]), - run_cfg=dict(num_gpus=1, num_procs=1), - stop_words=['[INST]'], + model_kwargs=dict(dtype='bfloat16', tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/mistral/vllm_mixtral_8x22b_instruct_v0_1.py b/configs/models/mistral/vllm_mixtral_8x22b_instruct_v0_1.py new file mode 100644 index 00000000..99a1cd28 --- /dev/null +++ b/configs/models/mistral/vllm_mixtral_8x22b_instruct_v0_1.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLMwithChatTemplate + + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='mixtral-8x22b-instruct-v0.1-vllm', + path='mistralai/Mixtral-8x22B-Instruct-v0.1', + model_kwargs=dict(tensor_parallel_size=8), + max_out_len=256, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=8), + ) +] diff --git a/configs/models/mistral/vllm_mixtral_8x22b_v0_1.py b/configs/models/mistral/vllm_mixtral_8x22b_v0_1.py new file mode 100644 index 00000000..5f5536ba --- /dev/null +++ b/configs/models/mistral/vllm_mixtral_8x22b_v0_1.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='mixtral-8x22b-v0.1-vllm', + path='mistralai/Mixtral-8x22B-v0.1', + model_kwargs=dict(dtype='bfloat16', tensor_parallel_size=8), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=8), + ) +] diff --git a/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py b/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py index 2bbc0f0d..56759dd6 100644 --- a/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py +++ b/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py @@ -1,26 +1,15 @@ -from opencompass.models import VLLM +from opencompass.models import VLLMwithChatTemplate -_meta_template = dict( - begin='', - round=[ - dict(role='HUMAN', begin='[INST]', end='[/INST]'), - dict(role='BOT', begin='', end='', generate=True), - ], -) - models = [ dict( - type=VLLM, + type=VLLMwithChatTemplate, abbr='mixtral-8x7b-instruct-v0.1-vllm', path='mistralai/Mixtral-8x7B-Instruct-v0.1', model_kwargs=dict(tensor_parallel_size=2), - meta_template=_meta_template, - max_out_len=100, - max_seq_len=2048, - batch_size=32, + max_out_len=256, + batch_size=16, generation_kwargs=dict(temperature=0), - stop_words=[''], - run_cfg=dict(num_gpus=2, num_procs=1), + run_cfg=dict(num_gpus=2), ) ] diff --git a/configs/models/mistral/vllm_mixtral_8x7b_v0_1.py b/configs/models/mistral/vllm_mixtral_8x7b_v0_1.py new file mode 100644 index 00000000..76a10bb2 --- /dev/null +++ b/configs/models/mistral/vllm_mixtral_8x7b_v0_1.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='mixtral-8x7b-v0.1-vllm', + path='mistralai/Mixtral-8x7B-v0.1', + model_kwargs=dict(dtype='bfloat16', tensor_parallel_size=2), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/nanbeige/hf_nanbeige2_16b_chat.py b/configs/models/nanbeige/hf_nanbeige2_16b_chat.py new file mode 100644 index 00000000..4660f329 --- /dev/null +++ b/configs/models/nanbeige/hf_nanbeige2_16b_chat.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + 
abbr='nanbeige2-16b-chat-hf', + path='Nanbeige/Nanbeige2-16B-Chat', + max_out_len=1024, + batch_size=4, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/others/hf_telechat_12b_v2.py b/configs/models/others/hf_telechat_12b_v2.py new file mode 100644 index 00000000..f4476cdd --- /dev/null +++ b/configs/models/others/hf_telechat_12b_v2.py @@ -0,0 +1,26 @@ +from opencompass.models import HuggingFaceCausalLM + + +_meta_template = dict( + round=[ + dict(role='HUMAN', begin='<_user>'), + dict(role='BOT', begin='<_bot>', end='<_end>', generate=True), + ], +) + +models = [ + dict( + abbr='telechat-12b-v2-hf', + type=HuggingFaceCausalLM, + path='Tele-AI/TeleChat-12B-v2', + model_kwargs=dict(device_map='auto', torch_dtype='torch.bfloat16', trust_remote_code=True), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left', trust_remote_code=True), + meta_template=_meta_template, + max_out_len=100, + max_seq_len=2048, + batch_size=1, + run_cfg=dict(num_gpus=2), + end_str='<_end>', + batch_padding=True, + ) +] diff --git a/configs/models/others/hf_telechat_52b.py b/configs/models/others/hf_telechat_52b.py new file mode 100644 index 00000000..2585dbcc --- /dev/null +++ b/configs/models/others/hf_telechat_52b.py @@ -0,0 +1,26 @@ +from opencompass.models import HuggingFaceCausalLM + + +_meta_template = dict( + round=[ + dict(role='HUMAN', begin='<_user>'), + dict(role='BOT', begin='<_bot>', end='<_end>', generate=True), + ], +) + +models = [ + dict( + abbr='telechat-52b-hf', + type=HuggingFaceCausalLM, + path='Tele-AI/TeleChat-52B', + model_kwargs=dict(device_map='auto', torch_dtype='torch.bfloat16', trust_remote_code=True), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left', trust_remote_code=True), + meta_template=_meta_template, + max_out_len=100, + max_seq_len=2048, + batch_size=1, + run_cfg=dict(num_gpus=4), + end_str='<_end>', + batch_padding=True, + ) +] diff --git a/configs/models/others/hf_telechat_7b_chat.py b/configs/models/others/hf_telechat_7b.py similarity index 59% rename from configs/models/others/hf_telechat_7b_chat.py rename to configs/models/others/hf_telechat_7b.py index 05409891..1fbfaa62 100644 --- a/configs/models/others/hf_telechat_7b_chat.py +++ b/configs/models/others/hf_telechat_7b.py @@ -10,19 +10,11 @@ _meta_template = dict( models = [ dict( - abbr='telechat-7b-hf--rerun', + abbr='telechat-7b-hf', type=HuggingFaceCausalLM, path='Tele-AI/telechat-7B', - tokenizer_path='Tele-AI/telechat-7B', - model_kwargs=dict( - device_map='auto', - trust_remote_code=True, - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - ), + model_kwargs=dict(device_map='auto', trust_remote_code=True), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left', trust_remote_code=True), meta_template=_meta_template, max_out_len=100, max_seq_len=2048, diff --git a/configs/models/others/vllm_dbrx_instruct.py b/configs/models/others/vllm_dbrx_instruct.py new file mode 100644 index 00000000..70d2736f --- /dev/null +++ b/configs/models/others/vllm_dbrx_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='dbrx-instruct-vllm', + path='databricks/dbrx-instruct', + model_kwargs=dict(tensor_parallel_size=8), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=8), + ) +] diff --git a/configs/models/phi/hf_phi_3_medium_4k_instruct.py 
b/configs/models/phi/hf_phi_3_medium_4k_instruct.py new file mode 100644 index 00000000..444f5ae0 --- /dev/null +++ b/configs/models/phi/hf_phi_3_medium_4k_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='phi-3-medium-4k-instruct-hf', + path='microsoft/Phi-3-medium-4k-instruct', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/phi/hf_phi_3_mini_4k_instruct.py b/configs/models/phi/hf_phi_3_mini_4k_instruct.py new file mode 100644 index 00000000..e9c4544c --- /dev/null +++ b/configs/models/phi/hf_phi_3_mini_4k_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='phi-3-mini-4k-instruct-hf', + path='microsoft/Phi-3-mini-4k-instruct', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/phi/hf_phi_3_small_8k_instruct.py b/configs/models/phi/hf_phi_3_small_8k_instruct.py new file mode 100644 index 00000000..89b0fa76 --- /dev/null +++ b/configs/models/phi/hf_phi_3_small_8k_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='phi-3-small-8k-instruct-hf', + path='microsoft/Phi-3-small-8k-instruct', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/README.md b/configs/models/qwen/README.md index f8dbd8ad..3db35a26 100644 --- a/configs/models/qwen/README.md +++ b/configs/models/qwen/README.md @@ -32,7 +32,7 @@ python3 run.py --models hf_qwen1_5_7b --datasets bbh_gen_98fba6 --debug python3 run.py --models hf_qwen1_5_7b --datasets gsm8k_gen_17d0dc --debug python3 run.py --models hf_qwen1_5_7b --datasets math_4shot_base_gen_db136b --debug python3 run.py --models hf_qwen1_5_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug -python3 run.py --models hf_qwen1_5_7b --datasets humaneval_gen_d2537e --debug +python3 run.py --models hf_qwen1_5_7b --datasets deprecated_humaneval_gen_d2537e --debug python3 run.py --models hf_qwen1_5_7b --datasets sanitized_mbpp_gen_742f0c --debug python3 run.py --models hf_qwen1_5_7b --datasets lcbench_gen_5ff288 --debug python3 run.py --models hf_qwen1_5_7b --datasets gpqa_ppl_6bf57a --debug diff --git a/configs/models/qwen/hf_qwen1_5_0_5b_chat.py b/configs/models/qwen/hf_qwen1_5_0_5b_chat.py index eb5c22da..98d2eb7f 100644 --- a/configs/models/qwen/hf_qwen1_5_0_5b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_0_5b_chat.py @@ -8,5 +8,6 @@ models = [ max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen1_5_110b_chat.py b/configs/models/qwen/hf_qwen1_5_110b_chat.py index 2924a8e3..dc150bdb 100644 --- a/configs/models/qwen/hf_qwen1_5_110b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_110b_chat.py @@ -8,5 +8,6 @@ models = [ max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=8), + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen1_5_14b_chat.py b/configs/models/qwen/hf_qwen1_5_14b_chat.py index 522a82c0..35055aa8 100644 --- a/configs/models/qwen/hf_qwen1_5_14b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_14b_chat.py @@ -8,5 +8,6 @@ models = [ max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=2), + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen1_5_1_8b_chat.py 
b/configs/models/qwen/hf_qwen1_5_1_8b_chat.py index e1682e07..37607b1d 100644 --- a/configs/models/qwen/hf_qwen1_5_1_8b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_1_8b_chat.py @@ -8,5 +8,6 @@ models = [ max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen1_5_32b_chat.py b/configs/models/qwen/hf_qwen1_5_32b_chat.py index 03506d13..51f452cf 100644 --- a/configs/models/qwen/hf_qwen1_5_32b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_32b_chat.py @@ -8,5 +8,6 @@ models = [ max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=2), + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen1_5_4b_chat.py b/configs/models/qwen/hf_qwen1_5_4b_chat.py index 32475a5f..f5157c9b 100644 --- a/configs/models/qwen/hf_qwen1_5_4b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_4b_chat.py @@ -8,5 +8,6 @@ models = [ max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen1_5_72b_chat.py b/configs/models/qwen/hf_qwen1_5_72b_chat.py index 9f5477f7..1e293676 100644 --- a/configs/models/qwen/hf_qwen1_5_72b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_72b_chat.py @@ -8,5 +8,6 @@ models = [ max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=8), + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen1_5_7b_chat.py b/configs/models/qwen/hf_qwen1_5_7b_chat.py index b62c3bed..0ac8254f 100644 --- a/configs/models/qwen/hf_qwen1_5_7b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_7b_chat.py @@ -8,5 +8,6 @@ models = [ max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py b/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py index d33ea704..01184693 100644 --- a/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py @@ -8,5 +8,6 @@ models = [ max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen2_0_5b_instruct.py b/configs/models/qwen/hf_qwen2_0_5b_instruct.py new file mode 100644 index 00000000..76e0dbba --- /dev/null +++ b/configs/models/qwen/hf_qwen2_0_5b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2-0.5b-instruct-hf', + path='Qwen/Qwen2-0.5B-Instruct', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/hf_qwen2_1_5b_instruct.py b/configs/models/qwen/hf_qwen2_1_5b_instruct.py new file mode 100644 index 00000000..89fb082c --- /dev/null +++ b/configs/models/qwen/hf_qwen2_1_5b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2-1.5b-instruct-hf', + path='Qwen/Qwen2-1.5B-Instruct', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/hf_qwen2_moe_57b_a14b.py b/configs/models/qwen/hf_qwen2_57b_a14b.py similarity index 71% rename from configs/models/qwen/hf_qwen2_moe_57b_a14b.py rename to configs/models/qwen/hf_qwen2_57b_a14b.py index adc0ba8f..1e5f51a7 100644 --- a/configs/models/qwen/hf_qwen2_moe_57b_a14b.py +++ b/configs/models/qwen/hf_qwen2_57b_a14b.py @@ -3,8 +3,8 @@ from opencompass.models import 
HuggingFaceBaseModel models = [ dict( type=HuggingFaceBaseModel, - abbr='qwen2-moe-57b-a14b-hf', - path='Qwen/Qwen2-MoE-57B-A14B', + abbr='qwen2-57b-a14b-hf', + path='Qwen/Qwen2-57B-A14B', max_out_len=1024, batch_size=8, run_cfg=dict(num_gpus=4), diff --git a/configs/models/qwen/hf_qwen2_7b_instruct.py b/configs/models/qwen/hf_qwen2_7b_instruct.py new file mode 100644 index 00000000..490caeb1 --- /dev/null +++ b/configs/models/qwen/hf_qwen2_7b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2-7b-instruct-hf', + path='Qwen/Qwen2-7B-Instruct', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/hf_qwen_14b_chat.py b/configs/models/qwen/hf_qwen_14b_chat.py index 781451e1..640eb578 100644 --- a/configs/models/qwen/hf_qwen_14b_chat.py +++ b/configs/models/qwen/hf_qwen_14b_chat.py @@ -1,31 +1,13 @@ -from opencompass.models import HuggingFaceCausalLM - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), - ], -) +from opencompass.models import HuggingFacewithChatTemplate models = [ dict( - type=HuggingFaceCausalLM, + type=HuggingFacewithChatTemplate, abbr='qwen-14b-chat-hf', path='Qwen/Qwen-14B-Chat', - model_kwargs=dict(device_map='auto', trust_remote_code=True), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False, - ), - pad_token_id=151643, max_out_len=1024, - max_seq_len=8192, batch_size=8, - batch_padding=True, - meta_template=_meta_template, run_cfg=dict(num_gpus=2), - end_str='<|im_end|>', + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen_1_8b_chat.py b/configs/models/qwen/hf_qwen_1_8b_chat.py index 05b77564..3303e2e7 100644 --- a/configs/models/qwen/hf_qwen_1_8b_chat.py +++ b/configs/models/qwen/hf_qwen_1_8b_chat.py @@ -1,31 +1,13 @@ -from opencompass.models import HuggingFaceCausalLM - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), - ], -) +from opencompass.models import HuggingFacewithChatTemplate models = [ dict( - type=HuggingFaceCausalLM, + type=HuggingFacewithChatTemplate, abbr='qwen-1.8b-chat-hf', path='Qwen/Qwen-1_8B-Chat', - model_kwargs=dict(device_map='auto', trust_remote_code=True), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False, - ), - pad_token_id=151643, max_out_len=1024, - max_seq_len=8192, batch_size=8, - batch_padding=True, - meta_template=_meta_template, run_cfg=dict(num_gpus=1), - end_str='<|im_end|>', + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen_72b_chat.py b/configs/models/qwen/hf_qwen_72b_chat.py index bf214abb..c1dbf5b3 100644 --- a/configs/models/qwen/hf_qwen_72b_chat.py +++ b/configs/models/qwen/hf_qwen_72b_chat.py @@ -1,31 +1,13 @@ -from opencompass.models import HuggingFaceCausalLM - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), - ], -) +from opencompass.models import HuggingFacewithChatTemplate models = [ dict( - type=HuggingFaceCausalLM, + 
type=HuggingFacewithChatTemplate, abbr='qwen-72b-chat-hf', path='Qwen/Qwen-72B-Chat', - model_kwargs=dict(device_map='auto', trust_remote_code=True), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False, - ), - pad_token_id=151643, max_out_len=1024, - max_seq_len=8192, batch_size=8, - batch_padding=True, - meta_template=_meta_template, run_cfg=dict(num_gpus=8), - end_str='<|im_end|>', + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/hf_qwen_7b_chat.py b/configs/models/qwen/hf_qwen_7b_chat.py index 9fe97f83..f94d7e83 100644 --- a/configs/models/qwen/hf_qwen_7b_chat.py +++ b/configs/models/qwen/hf_qwen_7b_chat.py @@ -1,31 +1,13 @@ -from opencompass.models import HuggingFaceCausalLM - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), - ], -) +from opencompass.models import HuggingFacewithChatTemplate models = [ dict( - type=HuggingFaceCausalLM, + type=HuggingFacewithChatTemplate, abbr='qwen-7b-chat-hf', path='Qwen/Qwen-7B-Chat', - model_kwargs=dict(device_map='auto', trust_remote_code=True), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False, - ), - pad_token_id=151643, max_out_len=1024, - max_seq_len=8192, - batch_size=8, - batch_padding=True, - meta_template=_meta_template, + batch_size=4, run_cfg=dict(num_gpus=1), - end_str='<|im_end|>', + stop_words=['<|im_end|>', '<|im_start|>'], ) ] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_110b.py b/configs/models/qwen/lmdeploy_qwen1_5_110b.py new file mode 100644 index 00000000..13b3c0b5 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_110b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen1.5-110b-turbomind', + path='Qwen/Qwen1.5-110B', + engine_config=dict(session_len=7168, max_batch_size=8, tp=8, cache_max_entry_count=0.6), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=8), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py new file mode 100644 index 00000000..9b92b814 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen1.5-110b-chat-turbomind', + path='Qwen/Qwen1.5-110B-Chat', + engine_config=dict(session_len=7168, max_batch_size=8, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=4), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_14b.py b/configs/models/qwen/lmdeploy_qwen1_5_14b.py new file mode 100644 index 00000000..459ad1c0 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_14b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen1.5-14b-turbomind', + path='Qwen/Qwen1.5-14B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + 
batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py new file mode 100644 index 00000000..d2b85c2a --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen1.5-14b-chat-turbomind', + path='Qwen/Qwen1.5-14B-Chat', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_1_8b.py b/configs/models/qwen/lmdeploy_qwen1_5_1_8b.py new file mode 100644 index 00000000..89dbb51d --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_1_8b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen1.5-1.8b-turbomind', + path='Qwen/Qwen1.5-1.8B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py new file mode 100644 index 00000000..ff28ac0b --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen1.5-1.8b-chat-turbomind', + path='Qwen/Qwen1.5-1.8B-Chat', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_32b.py b/configs/models/qwen/lmdeploy_qwen1_5_32b.py new file mode 100644 index 00000000..51146100 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_32b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen1.5-32b-turbomind', + path='Qwen/Qwen1.5-32B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=2, cache_max_entry_count=0.7), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py new file mode 100644 index 00000000..1196548a --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen1.5-32b-chat-turbomind', + path='Qwen/Qwen1.5-32B-Chat', + engine_config=dict(session_len=7168, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=2), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_4b.py 
b/configs/models/qwen/lmdeploy_qwen1_5_4b.py new file mode 100644 index 00000000..298152fa --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_4b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen1.5-4b-turbomind', + path='Qwen/Qwen1.5-4B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py new file mode 100644 index 00000000..bde14a29 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen1.5-4b-chat-turbomind', + path='Qwen/Qwen1.5-4B-Chat', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_72b.py b/configs/models/qwen/lmdeploy_qwen1_5_72b.py new file mode 100644 index 00000000..892e1825 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_72b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen1.5-72b-turbomind', + path='Qwen/Qwen1.5-72B', + engine_config=dict(session_len=7168, max_batch_size=4, tp=4, cache_max_entry_count=0.7), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=4, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py new file mode 100644 index 00000000..38175eaf --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen1.5-72b-chat-turbomind', + path='Qwen/Qwen1.5-72B-Chat', + engine_config=dict(session_len=7168, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_7b.py b/configs/models/qwen/lmdeploy_qwen1_5_7b.py new file mode 100644 index 00000000..b4290806 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen1.5-7b-turbomind', + path='Qwen/Qwen1.5-7B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py new file mode 100644 index 00000000..ca733c0b --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = 
[ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen1.5-7b-chat-turbomind', + path='Qwen/Qwen1.5-7B-Chat', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_series.py b/configs/models/qwen/lmdeploy_qwen1_5_series.py index ab213114..fc49582e 100644 --- a/configs/models/qwen/lmdeploy_qwen1_5_series.py +++ b/configs/models/qwen/lmdeploy_qwen1_5_series.py @@ -1,30 +1,30 @@ -from opencompass.models import LmdeployPytorchModel +from opencompass.models import TurboMindModel settings = [ - ('qwen1.5-0.5b-pytorch', 'Qwen/Qwen1.5-0.5B', 1), - ('qwen1.5-1.8b-pytorch', 'Qwen/Qwen1.5-1.8B', 1), - ('qwen1.5-4b-pytorch', 'Qwen/Qwen1.5-4B', 1), - ('qwen1.5-7b-pytorch', 'Qwen/Qwen1.5-7B', 1), - ('qwen1.5-14b-pytorch', 'Qwen/Qwen1.5-14B', 1), - ('qwen1.5-32b-pytorch', 'Qwen/Qwen1.5-32B', 2), - ('qwen1.5-72b-pytorch', 'Qwen/Qwen1.5-72B', 4), - ('qwen1.5-110b-pytorch', 'Qwen/Qwen1.5-110B', 4), - ('qwen1.5-moe-a2.7b-pytorch', 'Qwen/Qwen1.5-MoE-A2.7B', 1), + # ('qwen1.5-0.5b-turbomind', 'Qwen/Qwen1.5-0.5B', 1), + ('qwen1.5-1.8b-turbomind', 'Qwen/Qwen1.5-1.8B', 1), + ('qwen1.5-4b-turbomind', 'Qwen/Qwen1.5-4B', 1), + ('qwen1.5-7b-turbomind', 'Qwen/Qwen1.5-7B', 1), + ('qwen1.5-14b-turbomind', 'Qwen/Qwen1.5-14B', 1), + ('qwen1.5-32b-turbomind', 'Qwen/Qwen1.5-32B', 2), + ('qwen1.5-72b-turbomind', 'Qwen/Qwen1.5-72B', 4), + ('qwen1.5-110b-turbomind', 'Qwen/Qwen1.5-110B', 4), + ('qwen1.5-moe-a2.7b-turbomind', 'Qwen/Qwen1.5-MoE-A2.7B', 1), ] models = [] for abbr, path, num_gpus in settings: models.append( dict( - type=LmdeployPytorchModel, + type=TurboMindModel, abbr=abbr, path=path, - engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), - gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + engine_config=dict(session_len=7168, max_batch_size=8, tp=num_gpus), + gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=1024), max_out_len=1024, - max_seq_len=2048, - batch_size=16, - concurrency=16, - run_cfg=dict(num_gpus=num_gpus), + max_seq_len=7168, + batch_size=8, + concurrency=8, + run_cfg=dict(num_gpus=num_gpus, num_procs=1), ) ) diff --git a/configs/models/qwen/lmdeploy_qwen2_1_5b.py b/configs/models/qwen/lmdeploy_qwen2_1_5b.py new file mode 100644 index 00000000..6da4573b --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen2_1_5b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2-1.5b-turbomind', + path='Qwen/Qwen2-1.5B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py b/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py new file mode 100644 index 00000000..502de187 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2-1.5b-instruct-turbomind', + path='Qwen/Qwen2-1.5B-Instruct', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, 
top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen2_72b.py b/configs/models/qwen/lmdeploy_qwen2_72b.py new file mode 100644 index 00000000..87c2a027 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen2_72b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2-72b-turbomind', + path='Qwen/Qwen2-72B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=4, cache_max_entry_count=0.7), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py b/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py new file mode 100644 index 00000000..69ecb798 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2-72b-instruct-turbomind', + path='Qwen/Qwen2-72B-Instruct', + engine_config=dict(session_len=7168, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen2_7b.py b/configs/models/qwen/lmdeploy_qwen2_7b.py new file mode 100644 index 00000000..fb410af6 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen2_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2-7b-turbomind', + path='Qwen/Qwen2-7B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py b/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py new file mode 100644 index 00000000..4dff85e0 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2-7b-instruct-turbomind', + path='Qwen/Qwen2-7B-Instruct', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen2_series.py b/configs/models/qwen/lmdeploy_qwen2_series.py new file mode 100644 index 00000000..dc5c1920 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen2_series.py @@ -0,0 +1,26 @@ +from opencompass.models import TurboMindModel + +settings = [ + # ('qwen2-0.5b-turbomind', 'Qwen/Qwen2-0.5B', 1), + ('qwen2-1.5b-turbomind', 'Qwen/Qwen2-1.5B', 1), + ('qwen2-7b-turbomind', 'Qwen/Qwen2-7B', 1), + ('qwen2-72b-turbomind', 'Qwen/Qwen2-72B', 4), + ('qwen2-57b-a14b-turbomind', 'Qwen/Qwen2-57B-A14B', 2), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=TurboMindModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=7168, max_batch_size=8, tp=num_gpus), + gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=1024), + 
max_out_len=1024, + max_seq_len=7168, + batch_size=8, + concurrency=8, + run_cfg=dict(num_gpus=num_gpus, num_procs=1), + ) + ) diff --git a/configs/models/qwen/lmdeploy_qwen_14b.py b/configs/models/qwen/lmdeploy_qwen_14b.py new file mode 100644 index 00000000..c7f0f547 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen_14b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen-14b-turbomind', + path='Qwen/Qwen-14B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen_14b_chat.py b/configs/models/qwen/lmdeploy_qwen_14b_chat.py new file mode 100644 index 00000000..ccaee6a7 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen_14b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen-14b-chat-turbomind', + path='Qwen/Qwen-14B-Chat', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen_1_8b.py b/configs/models/qwen/lmdeploy_qwen_1_8b.py new file mode 100644 index 00000000..b8b1fa4e --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen_1_8b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen-1.8b-turbomind', + path='Qwen/Qwen-1_8B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen_1_8b_chat.py b/configs/models/qwen/lmdeploy_qwen_1_8b_chat.py new file mode 100644 index 00000000..8126bdce --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen_1_8b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen-1.8b-chat-turbomind', + path='Qwen/Qwen-1_8B-Chat', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen_72b.py b/configs/models/qwen/lmdeploy_qwen_72b.py new file mode 100644 index 00000000..e338fee1 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen_72b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen-72b-turbomind', + path='Qwen/Qwen-72B', + engine_config=dict(session_len=7168, max_batch_size=4, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=4, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen_72b_chat.py b/configs/models/qwen/lmdeploy_qwen_72b_chat.py new file mode 100644 index 00000000..f7faf627 --- /dev/null +++ 
b/configs/models/qwen/lmdeploy_qwen_72b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen-72b-chat-turbomind', + path='Qwen/Qwen-72B-Chat', + engine_config=dict(session_len=7168, max_batch_size=4, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=4, + run_cfg=dict(num_gpus=4), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen_7b.py b/configs/models/qwen/lmdeploy_qwen_7b.py new file mode 100644 index 00000000..5d95e719 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen-7b-turbomind', + path='Qwen/Qwen-7B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen_7b_chat.py b/configs/models/qwen/lmdeploy_qwen_7b_chat.py new file mode 100644 index 00000000..27776127 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen_7b_chat.py @@ -0,0 +1,16 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen-7b-chat-turbomind', + path='Qwen/Qwen-7B-Chat', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + stop_words=['<|im_end|>', '<|im_start|>'], + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen_series.py b/configs/models/qwen/lmdeploy_qwen_series.py index 12b16820..33d78b4b 100644 --- a/configs/models/qwen/lmdeploy_qwen_series.py +++ b/configs/models/qwen/lmdeploy_qwen_series.py @@ -14,12 +14,13 @@ for abbr, path, num_gpus in settings: type=TurboMindModel, abbr=abbr, path=path, - engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + engine_config=dict(session_len=7168, max_batch_size=16, tp=num_gpus), gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), max_out_len=1024, - max_seq_len=2048, + max_seq_len=7168, batch_size=16, concurrency=16, run_cfg=dict(num_gpus=num_gpus), + stop_words=['<|im_end|>', '<|im_start|>'], ) ) diff --git a/configs/models/qwen/vllm_qwen1_5_0_5b.py b/configs/models/qwen/vllm_qwen1_5_0_5b.py new file mode 100644 index 00000000..4822de65 --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_0_5b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen1.5-0.5b-vllm', + path='Qwen/Qwen1.5-0.5B', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_0_5b_chat.py b/configs/models/qwen/vllm_qwen1_5_0_5b_chat.py new file mode 100644 index 00000000..60ff8707 --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_0_5b_chat.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen1.5-0.5b-chat-vllm', + path='Qwen/Qwen1.5-0.5B-Chat', + model_kwargs=dict(tensor_parallel_size=1, 
gpu_memory_utilization=0.5), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_110b.py b/configs/models/qwen/vllm_qwen1_5_110b.py new file mode 100644 index 00000000..1c302a9d --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_110b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen1.5-110b-vllm', + path='Qwen/Qwen1.5-110B', + model_kwargs=dict(tensor_parallel_size=4), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_110b_chat.py b/configs/models/qwen/vllm_qwen1_5_110b_chat.py new file mode 100644 index 00000000..0466e59c --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_110b_chat.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen1.5-110b-chat-vllm', + path='Qwen/Qwen1.5-110B-Chat', + model_kwargs=dict(tensor_parallel_size=4), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_14b.py b/configs/models/qwen/vllm_qwen1_5_14b.py new file mode 100644 index 00000000..f1aef793 --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_14b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen1.5-14b-vllm', + path='Qwen/Qwen1.5-14B', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_14b_chat.py b/configs/models/qwen/vllm_qwen1_5_14b_chat.py index b5e97d3d..72aafaae 100644 --- a/configs/models/qwen/vllm_qwen1_5_14b_chat.py +++ b/configs/models/qwen/vllm_qwen1_5_14b_chat.py @@ -1,25 +1,14 @@ -from opencompass.models import VLLM - - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), - ], -) +from opencompass.models import VLLMwithChatTemplate models = [ dict( - type=VLLM, + type=VLLMwithChatTemplate, abbr='qwen1.5-14b-chat-vllm', path='Qwen/Qwen1.5-14B-Chat', - model_kwargs=dict(tensor_parallel_size=2), - meta_template=_meta_template, - max_out_len=100, - max_seq_len=2048, - batch_size=32, + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=16, generation_kwargs=dict(temperature=0), - stop_words=['<|im_end|>'], - run_cfg=dict(num_gpus=2, num_procs=1), + run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/qwen/vllm_qwen1_5_1_8b.py b/configs/models/qwen/vllm_qwen1_5_1_8b.py new file mode 100644 index 00000000..6cde3122 --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_1_8b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen1.5-1.8b-vllm', + path='Qwen/Qwen1.5-1.8B', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_1_8b_chat.py b/configs/models/qwen/vllm_qwen1_5_1_8b_chat.py new file mode 100644 index 00000000..25c48174 --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_1_8b_chat.py @@ -0,0 +1,14 @@ +from 
opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen1.5-1.8b-chat-vllm', + path='Qwen/Qwen1.5-1.8B-Chat', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_32b.py b/configs/models/qwen/vllm_qwen1_5_32b.py new file mode 100644 index 00000000..e0d6e528 --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_32b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen1.5-32b-vllm', + path='Qwen/Qwen1.5-32B', + model_kwargs=dict(tensor_parallel_size=2), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_32b_chat.py b/configs/models/qwen/vllm_qwen1_5_32b_chat.py new file mode 100644 index 00000000..5684d0d7 --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_32b_chat.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen1.5-32b-chat-vllm', + path='Qwen/Qwen1.5-32B-Chat', + model_kwargs=dict(tensor_parallel_size=2), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_4b.py b/configs/models/qwen/vllm_qwen1_5_4b.py new file mode 100644 index 00000000..bddbbdb9 --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_4b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen1.5-4b-vllm', + path='Qwen/Qwen1.5-4B', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_4b_chat.py b/configs/models/qwen/vllm_qwen1_5_4b_chat.py new file mode 100644 index 00000000..2ff3719a --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_4b_chat.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen1.5-4b-chat-vllm', + path='Qwen/Qwen1.5-4B-Chat', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_72b.py b/configs/models/qwen/vllm_qwen1_5_72b.py index 1703d2ca..43681778 100644 --- a/configs/models/qwen/vllm_qwen1_5_72b.py +++ b/configs/models/qwen/vllm_qwen1_5_72b.py @@ -1,16 +1,15 @@ from opencompass.models import VLLM - models = [ dict( type=VLLM, abbr='qwen1.5-72b-vllm', path='Qwen/Qwen1.5-72B', model_kwargs=dict(tensor_parallel_size=4), - max_out_len=100, - max_seq_len=2048, - batch_size=32, + max_out_len=1024, + max_seq_len=8192, + batch_size=16, generation_kwargs=dict(temperature=0), - run_cfg=dict(num_gpus=4, num_procs=1), + run_cfg=dict(num_gpus=4), ) ] diff --git a/configs/models/qwen/vllm_qwen1_5_72b_chat.py b/configs/models/qwen/vllm_qwen1_5_72b_chat.py index 2cb2ca53..e3b70b7e 100644 --- a/configs/models/qwen/vllm_qwen1_5_72b_chat.py +++ b/configs/models/qwen/vllm_qwen1_5_72b_chat.py @@ -1,25 +1,14 @@ -from opencompass.models import VLLM - - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role='BOT', begin='<|im_start|>assistant\n', 
end='<|im_end|>\n', generate=True), - ], -) +from opencompass.models import VLLMwithChatTemplate models = [ dict( - type=VLLM, + type=VLLMwithChatTemplate, abbr='qwen1.5-72b-chat-vllm', path='Qwen/Qwen1.5-72B-Chat', model_kwargs=dict(tensor_parallel_size=4), - meta_template=_meta_template, - max_out_len=100, - max_seq_len=2048, - batch_size=32, + max_out_len=1024, + batch_size=16, generation_kwargs=dict(temperature=0), - stop_words=['<|im_end|>'], - run_cfg=dict(num_gpus=4, num_procs=1), + run_cfg=dict(num_gpus=4), ) ] diff --git a/configs/models/qwen/vllm_qwen1_5_7b.py b/configs/models/qwen/vllm_qwen1_5_7b.py new file mode 100644 index 00000000..af358867 --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen1.5-7b-vllm', + path='Qwen/Qwen1.5-7B', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_7b_chat.py b/configs/models/qwen/vllm_qwen1_5_7b_chat.py index f97c716f..af36ec04 100644 --- a/configs/models/qwen/vllm_qwen1_5_7b_chat.py +++ b/configs/models/qwen/vllm_qwen1_5_7b_chat.py @@ -7,7 +7,8 @@ models = [ path='Qwen/Qwen1.5-7B-Chat', model_kwargs=dict(tensor_parallel_size=1), max_out_len=1024, - batch_size=32768, + batch_size=16, + generation_kwargs=dict(temperature=0), run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/qwen/vllm_qwen1_5_moe_a2_7b.py b/configs/models/qwen/vllm_qwen1_5_moe_a2_7b.py new file mode 100644 index 00000000..4ca4c48a --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_moe_a2_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen1.5-moe-a2.7b-vllm', + path='Qwen/Qwen1.5-MoE-A2.7B', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_moe_a2_7b_chat.py b/configs/models/qwen/vllm_qwen1_5_moe_a2_7b_chat.py new file mode 100644 index 00000000..55e65a4b --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_moe_a2_7b_chat.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen1.5-moe-a2.7b-chat-vllm', + path='Qwen/Qwen1.5-MoE-A2.7B-Chat', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen1_5_series.py b/configs/models/qwen/vllm_qwen1_5_series.py new file mode 100644 index 00000000..d9b6fc6c --- /dev/null +++ b/configs/models/qwen/vllm_qwen1_5_series.py @@ -0,0 +1,29 @@ +from opencompass.models import VLLM + +settings = [ + ('qwen1.5-0.5b-vllm', 'Qwen/Qwen1.5-0.5B', 1), + ('qwen1.5-1.8b-vllm', 'Qwen/Qwen1.5-1.8B', 1), + ('qwen1.5-4b-vllm', 'Qwen/Qwen1.5-4B', 1), + ('qwen1.5-7b-vllm', 'Qwen/Qwen1.5-7B', 1), + ('qwen1.5-14b-vllm', 'Qwen/Qwen1.5-14B', 1), + ('qwen1.5-32b-vllm', 'Qwen/Qwen1.5-32B', 2), + ('qwen1.5-72b-vllm', 'Qwen/Qwen1.5-72B', 4), + ('qwen1.5-110b-vllm', 'Qwen/Qwen1.5-110B', 4), + ('qwen1.5-moe-a2.7b-vllm', 'Qwen/Qwen1.5-MoE-A2.7B', 1), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=VLLM, + abbr=abbr, + path=path, + 
model_kwargs=dict(tensor_parallel_size=num_gpus), + max_out_len=100, + max_seq_len=2048, + batch_size=32, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=num_gpus, num_procs=1), + ) + ) diff --git a/configs/models/qwen/vllm_qwen2_0_5b.py b/configs/models/qwen/vllm_qwen2_0_5b.py new file mode 100644 index 00000000..877dc5bd --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_0_5b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen2-0.5b-vllm', + path='Qwen/Qwen2-0.5B', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen2_0_5b_instruct.py b/configs/models/qwen/vllm_qwen2_0_5b_instruct.py new file mode 100644 index 00000000..48dd41b6 --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_0_5b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2-0.5b-instruct-vllm', + path='Qwen/Qwen2-0.5B-Instruct', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen2_1_5b.py b/configs/models/qwen/vllm_qwen2_1_5b.py new file mode 100644 index 00000000..5f1e01f6 --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_1_5b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen2-1.5b-vllm', + path='Qwen/Qwen2-1.5B', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen2_1_5b_instruct.py b/configs/models/qwen/vllm_qwen2_1_5b_instruct.py new file mode 100644 index 00000000..fb173d31 --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_1_5b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2-1.5b-instruct-vllm', + path='Qwen/Qwen2-1.5B-Instruct', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen2_57b_a14b_instruct.py b/configs/models/qwen/vllm_qwen2_57b_a14b_instruct.py new file mode 100644 index 00000000..6b586c74 --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_57b_a14b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2-57b-a14b-instruct-vllm', + path='Qwen/Qwen2-57B-A14B-Instruct', + model_kwargs=dict(tensor_parallel_size=2), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen/vllm_qwen2_72b.py b/configs/models/qwen/vllm_qwen2_72b.py new file mode 100644 index 00000000..f2f0b709 --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_72b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen2-72b-vllm', + path='Qwen/Qwen2-72B', + model_kwargs=dict(tensor_parallel_size=4), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff 
--git a/configs/models/qwen/vllm_qwen2_72b_instruct.py b/configs/models/qwen/vllm_qwen2_72b_instruct.py new file mode 100644 index 00000000..fd53bc44 --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_72b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2-72b-instruct-vllm', + path='Qwen/Qwen2-72B-Instruct', + model_kwargs=dict(tensor_parallel_size=4), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/vllm_qwen2_7b.py b/configs/models/qwen/vllm_qwen2_7b.py new file mode 100644 index 00000000..1ed1bb84 --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen2-7b-vllm', + path='Qwen/Qwen2-7B', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen2_7b_instruct.py b/configs/models/qwen/vllm_qwen2_7b_instruct.py new file mode 100644 index 00000000..0c4b6255 --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_7b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2-7b-instruct-vllm', + path='Qwen/Qwen2-7B-Instruct', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen2_series.py b/configs/models/qwen/vllm_qwen2_series.py new file mode 100644 index 00000000..b65d0993 --- /dev/null +++ b/configs/models/qwen/vllm_qwen2_series.py @@ -0,0 +1,25 @@ +from opencompass.models import VLLM + +settings = [ + ('qwen2-0.5b-vllm', 'Qwen/Qwen2-0.5B', 1), + ('qwen2-1.5b-vllm', 'Qwen/Qwen2-1.5B', 1), + ('qwen2-7b-vllm', 'Qwen/Qwen2-7B', 1), + ('qwen2-72b-vllm', 'Qwen/Qwen2-72B', 4), + ('qwen2-57b-a14b-vllm', 'Qwen/Qwen2-57B-A14B', 2), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=VLLM, + abbr=abbr, + path=path, + model_kwargs=dict(tensor_parallel_size=num_gpus), + max_out_len=100, + max_seq_len=2048, + batch_size=32, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=num_gpus, num_procs=1), + ) + ) diff --git a/configs/models/qwen/vllm_qwen_14b.py b/configs/models/qwen/vllm_qwen_14b.py new file mode 100644 index 00000000..4c369c2e --- /dev/null +++ b/configs/models/qwen/vllm_qwen_14b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen-14b-vllm', + path='Qwen/Qwen-14B', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen_14b_chat.py b/configs/models/qwen/vllm_qwen_14b_chat.py index 53e221aa..967d942b 100644 --- a/configs/models/qwen/vllm_qwen_14b_chat.py +++ b/configs/models/qwen/vllm_qwen_14b_chat.py @@ -1,25 +1,14 @@ -from opencompass.models import VLLM - - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), - ], -) +from opencompass.models import VLLMwithChatTemplate models = [ dict( - type=VLLM, + 
type=VLLMwithChatTemplate, abbr='qwen-14b-chat-vllm', path='Qwen/Qwen-14B-Chat', - model_kwargs=dict(tensor_parallel_size=4), - meta_template=_meta_template, - max_out_len=100, - max_seq_len=2048, - batch_size=32, + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=16, generation_kwargs=dict(temperature=0), - stop_words=['<|im_end|>'], - run_cfg=dict(num_gpus=4, num_procs=1), + run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/qwen/vllm_qwen_1_8b.py b/configs/models/qwen/vllm_qwen_1_8b.py new file mode 100644 index 00000000..38f2fd8a --- /dev/null +++ b/configs/models/qwen/vllm_qwen_1_8b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen-1.8b-vllm', + path='Qwen/Qwen-1_8B', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen_1_8b_chat.py b/configs/models/qwen/vllm_qwen_1_8b_chat.py new file mode 100644 index 00000000..f3bba883 --- /dev/null +++ b/configs/models/qwen/vllm_qwen_1_8b_chat.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen-1.8b-chat-vllm', + path='Qwen/Qwen-1_8B-Chat', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen_72b.py b/configs/models/qwen/vllm_qwen_72b.py new file mode 100644 index 00000000..5611703e --- /dev/null +++ b/configs/models/qwen/vllm_qwen_72b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen-72b-vllm', + path='Qwen/Qwen-72B', + model_kwargs=dict(tensor_parallel_size=4), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/vllm_qwen_72b_chat.py b/configs/models/qwen/vllm_qwen_72b_chat.py index fb727913..f28353f9 100644 --- a/configs/models/qwen/vllm_qwen_72b_chat.py +++ b/configs/models/qwen/vllm_qwen_72b_chat.py @@ -1,25 +1,14 @@ -from opencompass.models import VLLM - - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), - ], -) +from opencompass.models import VLLMwithChatTemplate models = [ dict( - type=VLLM, + type=VLLMwithChatTemplate, abbr='qwen-72b-chat-vllm', path='Qwen/Qwen-72B-Chat', model_kwargs=dict(tensor_parallel_size=4), - meta_template=_meta_template, - max_out_len=100, - max_seq_len=2048, - batch_size=32, + max_out_len=1024, + batch_size=16, generation_kwargs=dict(temperature=0), - stop_words=['<|im_end|>'], - run_cfg=dict(num_gpus=4, num_procs=1), + run_cfg=dict(num_gpus=4), ) ] diff --git a/configs/models/qwen/vllm_qwen_7b.py b/configs/models/qwen/vllm_qwen_7b.py new file mode 100644 index 00000000..7dae2619 --- /dev/null +++ b/configs/models/qwen/vllm_qwen_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +models = [ + dict( + type=VLLM, + abbr='qwen-7b-vllm', + path='Qwen/Qwen-7B', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + max_seq_len=8192, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen_7b_chat.py 
b/configs/models/qwen/vllm_qwen_7b_chat.py new file mode 100644 index 00000000..55016ffe --- /dev/null +++ b/configs/models/qwen/vllm_qwen_7b_chat.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen-7b-chat-vllm', + path='Qwen/Qwen-7B-Chat', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=1024, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen_series.py b/configs/models/qwen/vllm_qwen_series.py new file mode 100644 index 00000000..f01ef5ab --- /dev/null +++ b/configs/models/qwen/vllm_qwen_series.py @@ -0,0 +1,24 @@ +from opencompass.models import VLLM + +settings = [ + ('qwen-1.8b-vllm', 'Qwen/Qwen-1_8B', 1), + ('qwen-7b-vllm', 'Qwen/Qwen-7B', 1), + ('qwen-14b-vllm', 'Qwen/Qwen-14B', 1), + ('qwen-72b-vllm', 'Qwen/Qwen-72B', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=VLLM, + abbr=abbr, + path=path, + model_kwargs=dict(tensor_parallel_size=num_gpus), + max_out_len=100, + max_seq_len=2048, + batch_size=32, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=num_gpus, num_procs=1), + ) + ) diff --git a/configs/summarizers/charm_reason.py b/configs/summarizers/charm_reason.py index 8e12e901..0d86a846 100644 --- a/configs/summarizers/charm_reason.py +++ b/configs/summarizers/charm_reason.py @@ -1,6 +1,6 @@ -from mmengine.config import reasond_base +from mmengine.config import read_base -with reasond_base(): +with read_base(): from .groups.charm_reason import charm_reason_summary_groups summarizer = dict( diff --git a/configs/summarizers/compass_knowledge.py b/configs/summarizers/compass_knowledge.py deleted file mode 100644 index 336ed37c..00000000 --- a/configs/summarizers/compass_knowledge.py +++ /dev/null @@ -1,26 +0,0 @@ -# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen` -compassbench_v1_knowledge_names = [ - 'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', - 'compassbench_v1_knowledge-humanity-single_choice_cn_circular', - 'compassbench_v1_knowledge-natural_science-single_choice_cn_circular', - 'compassbench_v1_knowledge-social_science-single_choice_cn_circular', -] - -compassbench_v1_knowledge_groups = [ - {'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names}, - {'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]}, - {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]}, -] - -summarizer = dict( - dataset_abbrs=[ - 'knowledge_perf_4_and_cloze', - ['knowledge_cn', 'perf_4'], - 'compassbench_v1_knowledge-mixed-cloze_en', - ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'], - ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'], - ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'], - ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'], - ], - summary_groups=compassbench_v1_knowledge_groups -) diff --git a/configs/summarizers/compass_math.py b/configs/summarizers/compass_math.py deleted file mode 100644 index 4ebc22b5..00000000 --- a/configs/summarizers/compass_math.py +++ /dev/null @@ -1,24 +0,0 @@ -# This summarizer is used for `./datasets/compassbench_v1_math/compassbench_v1_math_gen` - 
-compassbench_v1_math_groups = [ - {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'acc_1'], ['compassbench_v1_math-high-single_choice_en', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]}, - {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]}, - {'name': 'math_perf_4_and_fill_in_blank_cn', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy']]}, - {'name': 'math_perf_4_and_fill_in_blank_en', 'subsets': [['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]}, -] - - -summarizer = dict( - dataset_abbrs=[ - 'math_perf_4_and_fill_in_blank', - 'math_perf_4_and_fill_in_blank_cn', - 'math_perf_4_and_fill_in_blank_en', - ['compassbench_v1_math-high-single_choice_cn', 'perf_4'], - ['compassbench_v1_math-high-single_choice_en', 'perf_4'], - ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], - ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], - ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], - ['compassbench_v1_math-primary-cloze_en', 'accuracy'], - ], - summary_groups=compassbench_v1_math_groups, -) diff --git a/configs/summarizers/compassbench_v1_1_objective.py b/configs/summarizers/compassbench_v1_1_objective.py new file mode 100644 index 00000000..14babf21 --- /dev/null +++ b/configs/summarizers/compassbench_v1_1_objective.py @@ -0,0 +1,244 @@ + +from mmengine.config import read_base + +with read_base(): + from .groups.legacy.cibench import cibench_summary_groups + from .groups.plugineval import plugineval_summary_groups + + +compassbench_v1_language_names = [ + # ['information_retrieval_en', 'score'], + # ['information_retrieval_zh', 'score'], + ['intention_recognition_en_circular', 'acc_origin'], + ['intention_recognition_en_circular', 'perf_circular'], + ['intention_recognition_zh_circular', 'acc_origin'], + ['intention_recognition_zh_circular', 'perf_circular'], + ['sentiment_analysis_en_circular', 'acc_origin'], + ['sentiment_analysis_en_circular', 'perf_circular'], + ['sentiment_analysis_zh_circular', 'acc_origin'], + ['sentiment_analysis_zh_circular', 'perf_circular'], + ['translation', 'score'], + ['content_critic_en_circular', 'acc_origin'], + ['content_critic_en_circular', 'perf_circular'], + ['content_critic_zh_circular', 'acc_origin'], + ['content_critic_zh_circular', 'perf_circular'], + ['content_summarization_en', 'rouge1'], + ['content_summarization_zh', 'rouge1'], + ['traditional_cultural_understanding_zh_circular', 'acc_origin'], + ['traditional_cultural_understanding_zh_circular', 'perf_circular'], + ['chinese_semantic_understanding_zh_circular', 'acc_origin'], + ['chinese_semantic_understanding_zh_circular', 'perf_circular'], +] + +compassbench_v1_language_summary_groups = [ + {'name': 'language_zh_acc_1_and_non_mcq', 
'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'perf_circular']}, + {'name': 'language_en_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'perf_circular']}, + {'name': 'language_acc_1_and_non_mcq', 'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']}, + + {'name': 'language_zh_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'acc_origin']}, + {'name': 'language_en_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'acc_origin']}, + {'name': 'language_perf_4_and_non_mcq', 'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']}, +] + +# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen` +compassbench_v1_knowledge_names = [ + 'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', + 'compassbench_v1_knowledge-humanity-single_choice_cn_circular', + 'compassbench_v1_knowledge-natural_science-single_choice_cn_circular', + 'compassbench_v1_knowledge-social_science-single_choice_cn_circular', +] + +compassbench_v1_knowledge_summary_groups = [ + {'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names}, + {'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]}, + {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]}, +] + +compassbench_v1_reason_summary_groups = [ + {'name': 'reasonbench_cn_abductive_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular']}, + {'name': 'reasonbench_en_abductive_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular']}, + {'name': 'reasonbench_cn_deductive_circular', 'subsets': ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular']}, + {'name': 'reasonbench_cn_inductive_circular', 'subsets': ['reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']}, + {'name': 'reasonbench_en_inductive_circular', 'subsets': ['reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']}, + + {'name': 'reasonbench_cn_circular', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_abductive_circular', 'reasonbench_cn_deductive_circular', 'reasonbench_cn_inductive_circular']}, + {'name': 'reasonbench_en_circular', 'subsets': ['reasonbench_en_commonsense_circular', 'reasonbench_en_abductive_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_inductive_circular']}, + {'name': 'reasonbench', 'subsets': ['reasonbench_cn_circular', 'reasonbench_en_circular']}, +] + +compassbench_v1_math_summary_groups = [ + # A & T + # {'name': 'mathbench-arithmetic', 'subsets': [['mathbench-arithmetic-cloze_en', 'accuracy']]}, + # {'name': 'mathbench-primary_cn', 'subsets': [['mathbench-primary_knowledge-single_choice_cn', 'perf_4'], ['mathbench-primary-cloze_cn', 'accuracy']]}, + # {'name': 'mathbench-primary_en', 'subsets': [['mathbench-primary_knowledge-single_choice_en', 'perf_4'], ['mathbench-primary-cloze_en', 'accuracy']]}, + # {'name': 'mathbench-middle_cn', 'subsets': [['mathbench-middle_knowledge-single_choice_cn', 
'perf_4'], ['mathbench-middle-single_choice_cn', 'perf_4']]}, + # {'name': 'mathbench-middle_en', 'subsets': [['mathbench-middle_knowledge-single_choice_en', 'perf_4'], ['mathbench-middle-single_choice_en', 'perf_4']]}, + # {'name': 'mathbench-high_cn', 'subsets': [['mathbench-high_knowledge-single_choice_cn', 'perf_4'], ['mathbench-high-single_choice_cn', 'perf_4']]}, + # {'name': 'mathbench-high_en', 'subsets': [['mathbench-high_knowledge-single_choice_en', 'perf_4'], ['mathbench-high-single_choice_en', 'perf_4']]}, + # {'name': 'mathbench-college_cn', 'subsets': [['mathbench-college_knowledge-single_choice_cn', 'perf_4'], ['mathbench-college-single_choice_cn', 'perf_4']]}, + # {'name': 'mathbench-college_en', 'subsets': [['mathbench-college_knowledge-single_choice_en', 'perf_4'], ['mathbench-college-single_choice_en', 'perf_4']]}, + # {'name': 'mathbench_cn', 'subsets': ['mathbench-arithmetic', 'mathbench-primary_cn', 'mathbench-middle_cn', 'mathbench-high_cn', 'mathbench-college_cn']}, + # {'name': 'mathbench_en', 'subsets': ['mathbench-arithmetic', 'mathbench-primary_en', 'mathbench-middle_en', 'mathbench-high_en', 'mathbench-college_en']}, + # {'name': 'mathbench', 'subsets': ['mathbench_cn', 'mathbench_en']}, + # A Only + {'name': 'mathbench-arithmetic', 'subsets': [['mathbench-arithmetic-cloze_en', 'accuracy']]}, + {'name': 'mathbench-primary_cn', 'subsets': [['mathbench-primary-cloze_cn', 'accuracy']]}, + {'name': 'mathbench-primary_en', 'subsets': [['mathbench-primary-cloze_en', 'accuracy']]}, + {'name': 'mathbench-middle_cn', 'subsets': [['mathbench-middle-single_choice_cn', 'perf_4']]}, + {'name': 'mathbench-middle_en', 'subsets': [['mathbench-middle-single_choice_en', 'perf_4']]}, + {'name': 'mathbench-high_cn', 'subsets': [['mathbench-high-single_choice_cn', 'perf_4']]}, + {'name': 'mathbench-high_en', 'subsets': [['mathbench-high-single_choice_en', 'perf_4']]}, + {'name': 'mathbench-college_cn', 'subsets': [['mathbench-college-single_choice_cn', 'perf_4']]}, + {'name': 'mathbench-college_en', 'subsets': [['mathbench-college-single_choice_en', 'perf_4']]}, + {'name': 'mathbench_cn', 'subsets': ['mathbench-arithmetic', 'mathbench-primary_cn', 'mathbench-middle_cn', 'mathbench-high_cn', 'mathbench-college_cn']}, + {'name': 'mathbench_en', 'subsets': ['mathbench-arithmetic', 'mathbench-primary_en', 'mathbench-middle_en', 'mathbench-high_en', 'mathbench-college_en']}, + {'name': 'mathbench', 'subsets': ['mathbench_cn', 'mathbench_en']}, +] + + +code_passk_summary_groups = [ + # rename + {'name': 'humaneval_cn_pass@1(greedy)', 'subsets': [['openai_humaneval_cn', 'humaneval_pass@1']]}, + {'name': 'humaneval_plus_pass@1(greedy)', 'subsets': [['humaneval_plus', 'humaneval_plus_pass@1']]}, + {'name': 'mbpp_cn_pass@1(greedy)', 'subsets': [['mbpp_cn', 'score']]}, + {'name': 'sanitized_mbpp_pass@1(greedy)', 'subsets': [['sanitized_mbpp', 'score']]}, + # real add + {'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']}, + {'name': 'lcbench_cn', 'subsets': ['lcbench_cn-EASY', 'lcbench_cn-MEDIUM', 'lcbench_cn-HARD']}, + {'name': 'lcbench_en', 'subsets': ['lcbench_en-EASY', 'lcbench_en-MEDIUM', 'lcbench_en-HARD']}, + {'name': 'TACO', 'subsets': ['TACO-EASY', 'TACO-MEDIUM', 'TACO-MEDIUM_HARD', 'TACO-HARD', 'TACO-VERY_HARD']}, + {'name': 'code_cn', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'lcbench_cn']}, + {'name': 'code_en', 'subsets': ['humaneval_plus_pass@1(greedy)', 
'sanitized_mbpp_pass@1(greedy)', 'humanevalx', 'lcbench_en', 'TACO']}, + {'name': 'code', 'subsets': [['humaneval_cn_pass@1(greedy)', 'naive_average'], ['mbpp_cn_pass@1(greedy)', 'naive_average'], ['humaneval_plus_pass@1(greedy)', 'naive_average'], ['sanitized_mbpp_pass@1(greedy)', 'naive_average'], ['humanevalx', 'naive_average'], ['lcbench_cn', 'pass@1'], ['lcbench_en', 'pass@1'], ['TACO', 'naive_average']]}, +] + +agent_summary_groups = [ + # dict(name='cibench_template', subsets=['cibench_template:executable', 'cibench_template:numeric_correct', 'cibench_template:text_score', 'cibench_template:vis_sim']), + # dict(name='cibench_template_cn', subsets=['cibench_template_cn:executable', 'cibench_template_cn:numeric_correct', 'cibench_template_cn:text_score', 'cibench_template_cn:vis_sim']), + + dict(name='cibench_template', subsets=['cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim']), + dict(name='cibench_template_cn', subsets=['cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim']), + + dict(name='agent_cn', subsets=['cibench_template_cn', 'plugin_eval-mus-p10_one_review_zh']), + dict(name='agent_en', subsets=['cibench_template', 'plugin_eval-mus-p10_one_review']), + dict(name='agent', subsets=['agent_cn', 'agent_en']), +] + +other_summary_groups = [ + { + 'name': 'average_cn', + 'subsets': [ + ['language_zh_perf_4_and_non_mcq', 'naive_average'], + ['knowledge_cn', 'perf_4'], + ['reasonbench_cn_circular', 'perf_circular'], + ['math_perf_4_and_fill_in_blank_cn', 'naive_average'], + ['code_cn', 'naive_average'], + ['agent_cn', 'naive_average'], + ], + }, + { + 'name': 'average_en', + 'subsets': [ + ['language_en_perf_4_and_non_mcq', 'naive_average'], + ['compassbench_v1_knowledge-mixed-cloze_en', 'score'], + ['reasonbench_en_circular', 'perf_circular'], + ['math_perf_4_and_fill_in_blank_en', 'naive_average'], + ['code_en', 'naive_average'], + ['agent_en', 'naive_average'], + ], + }, + { + 'name': 'average', + 'subsets': [ + ['language_perf_4_and_non_mcq', 'naive_average'], + ['knowledge_perf_4_and_cloze', 'naive_average'], + ['reasonbench', 'perf_circular'], + ['math_perf_4_and_fill_in_blank', 'naive_average'], + ['code', 'naive_average'], + ['agent', 'naive_average'], + ], + }, +] + + + + +summarizer = dict( + dataset_abbrs=[ + ['average', 'naive_average'], + ['average_cn', 'naive_average'], + ['average_en', 'naive_average'], + '', + '', + '', + + ['language_perf_4_and_non_mcq', 'naive_average'], + ['language_zh_perf_4_and_non_mcq', 'naive_average'], + ['language_en_perf_4_and_non_mcq', 'naive_average'], + ['intention_recognition_zh_circular', 'perf_circular'], + ['intention_recognition_en_circular', 'perf_circular'], + ['sentiment_analysis_zh_circular', 'perf_circular'], + ['sentiment_analysis_en_circular', 'perf_circular'], + ['translation', 'score'], + ['content_critic_zh_circular', 'perf_circular'], + ['content_critic_en_circular', 'perf_circular'], + ['content_summarization_zh', 'rouge1'], + ['content_summarization_en', 'rouge1'], + ['traditional_cultural_understanding_zh_circular', 'perf_circular'], + ['chinese_semantic_understanding_zh_circular', 'perf_circular'], + + ['knowledge_perf_4_and_cloze', 'naive_average'], + ['knowledge_cn', 'perf_4'], + ['compassbench_v1_knowledge-mixed-cloze_en', 'score'], + ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'], + 
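        # Descriptive note: each dataset_abbrs item is either a bare dataset/group
        # abbreviation (its default metric is reported) or an [abbr, metric] pair
        # selecting the column to report. 'perf_4' / 'perf_circular' count a question
        # as correct only when every circular option rotation is answered correctly,
        # while 'naive_average' is the unweighted mean over a summary group's subsets.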
['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'], + ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'], + ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'], + + ['reasonbench', 'perf_circular'], + ['reasonbench_cn_circular', 'perf_circular'], + ['reasonbench_en_circular', 'perf_circular'], + ['reasonbench_cn_commonsense_circular', 'perf_circular'], + ['reasonbench_cn_abductive_circular', 'perf_circular'], + ['reasonbench_cn_deductive_circular', 'perf_circular'], + ['reasonbench_cn_inductive_circular', 'perf_circular'], + ['reasonbench_en_commonsense_circular', 'perf_circular'], + ['reasonbench_en_abductive_circular', 'perf_circular'], + ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'], + ['reasonbench_en_inductive_circular', 'perf_circular'], + + ['mathbench', 'naive_average'], + ['mathbench_cn', 'naive_average'], + ['mathbench_en', 'naive_average'], + ['mathbench-arithmetic', 'naive_average'], + ['mathbench-primary_cn', 'naive_average'], + ['mathbench-primary_en', 'naive_average'], + ['mathbench-middle_cn', 'naive_average'], + ['mathbench-middle_en', 'naive_average'], + ['mathbench-high_cn', 'naive_average'], + ['mathbench-high_en', 'naive_average'], + ['mathbench-college_cn', 'naive_average'], + ['mathbench-college_en', 'naive_average'], + + ['code', 'naive_average'], + ['code_cn', 'naive_average'], + ['code_en', 'naive_average'], + ['humaneval_cn_pass@1(greedy)', 'naive_average'], + ['humaneval_plus_pass@1(greedy)', 'naive_average'], + ['mbpp_cn_pass@1(greedy)', 'naive_average'], + ['sanitized_mbpp_pass@1(greedy)', 'naive_average'], + ['humanevalx', 'naive_average'], + ['lcbench_cn', 'pass@1'], + ['lcbench_en', 'pass@1'], + ['TACO', 'naive_average'], + + ['agent', 'naive_average'], + ['agent_cn', 'naive_average'], + ['agent_en', 'naive_average'], + ['cibench_template_cn', 'naive_average'], + ['cibench_template', 'naive_average'], + ['plugin_eval-mus-p10_one_review_zh', 'naive_average'], + ['plugin_eval-mus-p10_one_review', 'naive_average'], + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/configs/summarizers/compassbench_v1_1_objective_public.py b/configs/summarizers/compassbench_v1_1_objective_public.py new file mode 100644 index 00000000..493c9ecb --- /dev/null +++ b/configs/summarizers/compassbench_v1_1_objective_public.py @@ -0,0 +1,22 @@ + +from mmengine.config import read_base + +with read_base(): + from .compassbench_v1_1_objective import summarizer + +for dataset_abbr in summarizer['dataset_abbrs']: + if isinstance(dataset_abbr, str): + continue + else: + dataset_abbr[0] = dataset_abbr[0] + '_public' +for summary_group in summarizer['summary_groups']: + summary_group['name'] = summary_group['name'] + '_public' + replaced_subset = [] + for subset in summary_group['subsets']: + if isinstance(subset, str): + replaced_subset.append(subset + '_public') + else: + replaced_subset.append([subset[0] + '_public', subset[1]]) + summary_group['subsets'] = replaced_subset + if 'weights' in summary_group: + summary_group['weights'] = {k + '_public': v for k, v in summary_group['weights'].items()} diff --git a/configs/summarizers/compassbench_v1_language.py b/configs/summarizers/compassbench_v1_language.py deleted file mode 100644 index 2d619e31..00000000 --- a/configs/summarizers/compassbench_v1_language.py +++ /dev/null @@ -1,53 +0,0 @@ -compassbench_v1_language_names = [ - # ['information_retrieval_en', 'score'], 
- # ['information_retrieval_zh', 'score'], - ['intention_recognition_en_circular', 'acc_origin'], - ['intention_recognition_en_circular', 'perf_circular'], - ['intention_recognition_zh_circular', 'acc_origin'], - ['intention_recognition_zh_circular', 'perf_circular'], - ['sentiment_analysis_en_circular', 'acc_origin'], - ['sentiment_analysis_en_circular', 'perf_circular'], - ['sentiment_analysis_zh_circular', 'acc_origin'], - ['sentiment_analysis_zh_circular', 'perf_circular'], - ['translation', 'score'], - ['content_critic_en_circular', 'acc_origin'], - ['content_critic_en_circular', 'perf_circular'], - ['content_critic_zh_circular', 'acc_origin'], - ['content_critic_zh_circular', 'perf_circular'], - ['content_summarization_en', 'rouge1'], - ['content_summarization_zh', 'rouge1'], - ['traditional_cultural_understanding_zh_circular', 'acc_origin'], - ['traditional_cultural_understanding_zh_circular', 'perf_circular'], - ['chinese_semantic_understanding_zh_circular', 'acc_origin'], - ['chinese_semantic_understanding_zh_circular', 'perf_circular'], -] - -compassbench_v1_language_groups = [ - {'name': 'language_zh_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'perf_circular']}, - {'name': 'language_en_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'perf_circular']}, - {'name': 'language_acc_1_and_non_mcq', 'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']}, - - {'name': 'language_zh_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'acc_origin']}, - {'name': 'language_en_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'acc_origin']}, - {'name': 'language_perf_4_and_non_mcq', 'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']}, -] - -summarizer = dict( - dataset_abbrs=[ - 'language_perf_4_and_non_mcq', - 'language_zh_perf_4_and_non_mcq', - 'language_en_perf_4_and_non_mcq', - ['intention_recognition_zh_circular', 'perf_circular'], - ['intention_recognition_en_circular', 'perf_circular'], - ['sentiment_analysis_zh_circular', 'perf_circular'], - ['sentiment_analysis_en_circular', 'perf_circular'], - ['translation', 'score'], - ['content_critic_zh_circular', 'perf_circular'], - ['content_critic_en_circular', 'perf_circular'], - ['content_summarization_zh', 'rouge1'], - ['content_summarization_en', 'rouge1'], - ['traditional_cultural_understanding_zh_circular', 'perf_circular'], - ['chinese_semantic_understanding_zh_circular', 'perf_circular'], - ], - summary_groups=compassbench_v1_language_groups, -) diff --git a/configs/summarizers/compassbench_v1_reason.py b/configs/summarizers/compassbench_v1_reason.py deleted file mode 100644 index ae855f70..00000000 --- a/configs/summarizers/compassbench_v1_reason.py +++ /dev/null @@ -1,28 +0,0 @@ -compassbench_v1_reason_groups = [ - {'name': 'reasonbench_cn_abductive_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular']}, - {'name': 'reasonbench_en_abductive_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular']}, - {'name': 'reasonbench_cn_deductive_circular', 'subsets': ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular']}, - {'name': 'reasonbench_cn_inductive_circular', 'subsets': 
['reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']}, - {'name': 'reasonbench_en_inductive_circular', 'subsets': ['reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']}, - - {'name': 'reasonbench_cn_circular', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_abductive_circular', 'reasonbench_cn_deductive_circular', 'reasonbench_cn_inductive_circular']}, - {'name': 'reasonbench_en_circular', 'subsets': ['reasonbench_en_commonsense_circular', 'reasonbench_en_abductive_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_inductive_circular']}, - {'name': 'reasonbench', 'subsets': ['reasonbench_cn_circular', 'reasonbench_en_circular']}, -] - -summarizer = dict( - dataset_abbrs=[ - ['reasonbench', 'perf_circular'], - ['reasonbench_cn_circular', 'perf_circular'], - ['reasonbench_en_circular', 'perf_circular'], - ['reasonbench_cn_commonsense_circular', 'perf_circular'], - ['reasonbench_cn_abductive_circular', 'perf_circular'], - ['reasonbench_cn_deductive_circular', 'perf_circular'], - ['reasonbench_cn_inductive_circular', 'perf_circular'], - ['reasonbench_en_commonsense_circular', 'perf_circular'], - ['reasonbench_en_abductive_circular', 'perf_circular'], - ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'], - ['reasonbench_en_inductive_circular', 'perf_circular'], - ], - summary_groups=compassbench_v1_reason_groups, -) diff --git a/configs/summarizers/groups/mmlu_pro.py b/configs/summarizers/groups/mmlu_pro.py new file mode 100644 index 00000000..ba2d9fc7 --- /dev/null +++ b/configs/summarizers/groups/mmlu_pro.py @@ -0,0 +1,5 @@ +categories = ['math', 'physics', 'chemistry', 'law', 'engineering', 'other', 'economics', 'health', 'psychology', 'business', 'biology', 'philosophy', 'computer science', 'history'] + +mmlu_pro_summary_groups = [ + {'name': 'mmlu_pro', 'subsets': ['mmlu_pro_' + c.replace(' ', '_') for c in categories]}, +] diff --git a/configs/summarizers/mmlu_pro.py b/configs/summarizers/mmlu_pro.py new file mode 100644 index 00000000..34784875 --- /dev/null +++ b/configs/summarizers/mmlu_pro.py @@ -0,0 +1,25 @@ +from mmengine.config import read_base + +with read_base(): + from .groups.mmlu_pro import mmlu_pro_summary_groups + +summarizer = dict( + dataset_abbrs=[ + 'mmlu_pro', + 'mmlu_pro_biology', + 'mmlu_pro_business', + 'mmlu_pro_chemistry', + 'mmlu_pro_computer_science', + 'mmlu_pro_economics', + 'mmlu_pro_engineering', + 'mmlu_pro_health', + 'mmlu_pro_history', + 'mmlu_pro_law', + 'mmlu_pro_math', + 'mmlu_pro_philosophy', + 'mmlu_pro_physics', + 'mmlu_pro_psychology', + 'mmlu_pro_other', + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/docs/en/user_guides/metrics.md b/docs/en/user_guides/metrics.md index 265c695a..ebf77373 100644 --- a/docs/en/user_guides/metrics.md +++ b/docs/en/user_guides/metrics.md @@ -6,7 +6,7 @@ In the evaluation phase, we typically select the corresponding evaluation metric - **Phrase**: Common in Q&A and reading comprehension tasks. This type of dataset mainly includes CLUE_CMRC, CLUE_DRCD, DROP datasets, etc. Matching rate is usually used as the evaluation standard--`EMEvaluator`. - **Sentence**: Common in translation and generating pseudocode/command-line tasks, mainly including Flores, Summscreen, Govrepcrs, Iwdlt2017 datasets, etc. 
BLEU (Bilingual Evaluation Understudy) is usually used as the evaluation standard--`BleuEvaluator`. - **Paragraph**: Common in text summary generation tasks, commonly used datasets mainly include Lcsts, TruthfulQA, Xsum datasets, etc. ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is usually used as the evaluation standard--`RougeEvaluator`. -- **Code**: Common in code generation tasks, commonly used datasets mainly include Humaneval, MBPP datasets, etc. Execution pass rate and `pass@k` are usually used as the evaluation standard. At present, Opencompass supports `MBPPEvaluator` and `HumanEvaluator`. +- **Code**: Common in code generation tasks, commonly used datasets mainly include Humaneval, MBPP datasets, etc. Execution pass rate and `pass@k` are usually used as the evaluation standard. At present, Opencompass supports `MBPPEvaluator` and `HumanEvalEvaluator`. There is also a type of **scoring-type** evaluation task without standard answers, such as judging whether the output of a model is toxic, which can directly use the related API service for scoring. At present, it supports `ToxicEvaluator`, and currently, the realtoxicityprompts dataset uses this evaluation method. @@ -21,7 +21,7 @@ Currently, in OpenCompass, commonly used Evaluators are mainly located in the [` | `BleuEvaluator` | BLEU | None, `flores` | flores, iwslt2017, summscreen, govrepcrs | | `RougeEvaluator` | ROUGE | None, dataset-specific | truthfulqa, Xsum, XLSum | | `JiebaRougeEvaluator` | ROUGE | None, dataset-specific | lcsts | -| `HumanEvaluator` | pass@k | `humaneval_postprocess` | humaneval_postprocess | +| `HumanEvalEvaluator` | pass@k | `humaneval_postprocess` | humaneval_postprocess | | `MBPPEvaluator` | Execution Pass Rate | None | mbpp | | `ToxicEvaluator` | PerspectiveAPI | None | realtoxicityprompts | | `AGIEvalEvaluator` | Accuracy | None | agieval | diff --git a/docs/zh_cn/user_guides/metrics.md b/docs/zh_cn/user_guides/metrics.md index fab43237..e033abb6 100644 --- a/docs/zh_cn/user_guides/metrics.md +++ b/docs/zh_cn/user_guides/metrics.md @@ -6,7 +6,7 @@ - **短语**:常见于问答以及阅读理解任务,这类数据集主要包括 CLUE_CMRC, CLUE_DRCD, DROP 数据集等等,评估标准一般使用匹配率--`EMEvaluator`。 - **句子**:常见于翻译以及生成伪代码、命令行任务中,主要包括 Flores, Summscreen, Govrepcrs, Iwdlt2017 数据集等等,评估标准一般使用 BLEU(Bilingual Evaluation Understudy)--`BleuEvaluator`。 - **段落**:常见于文本摘要生成的任务,常用的数据集主要包括 Lcsts, TruthfulQA, Xsum 数据集等等,评估标准一般使用 ROUGE(Recall-Oriented Understudy for Gisting Evaluation)--`RougeEvaluator`。 -- **代码**:常见于代码生成的任务,常用的数据集主要包括 Humaneval,MBPP 数据集等等,评估标准一般使用执行通过率以及 `pass@k`,目前 Opencompass 支持的有`MBPPEvaluator`、`HumanEvaluator`。 +- **代码**:常见于代码生成的任务,常用的数据集主要包括 Humaneval,MBPP 数据集等等,评估标准一般使用执行通过率以及 `pass@k`,目前 Opencompass 支持的有`MBPPEvaluator`、`HumanEvalEvaluator`。 还有一类**打分类型**评测任务没有标准答案,比如评判一个模型的输出是否存在有毒,可以直接使用相关 API 服务进行打分,目前支持的有 `ToxicEvaluator`,目前有 realtoxicityprompts 数据集使用此评测方式。 @@ -21,7 +21,7 @@ | `BleuEvaluator` | BLEU | None, `flores` | flores, iwslt2017, summscreen, govrepcrs | | `RougeEvaluator` | ROUGE | None, dataset_specification | truthfulqa, Xsum, XLSum | | `JiebaRougeEvaluator` | ROUGE | None, dataset_specification | lcsts | -| `HumanEvaluator` | pass@k | `humaneval_postprocess` | humaneval_postprocess | +| `HumanEvalEvaluator` | pass@k | `humaneval_postprocess` | humaneval_postprocess | | `MBPPEvaluator` | 执行通过率 | None | mbpp | | `ToxicEvaluator` | PerspectiveAPI | None | realtoxicityprompts | | `AGIEvalEvaluator` | 正确率 | None | agieval | diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index f0aaa5f4..0497e139 100644 --- 
a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -242,11 +242,14 @@ def main(): # update "actual" work_dir cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str) + current_workdir = cfg['work_dir'] + logger.info(f'Current exp folder: {current_workdir}') + os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True) # dump config output_config_path = osp.join(cfg.work_dir, 'configs', - f'{cfg_time_str}.py') + f'{cfg_time_str}_{os.getpid()}.py') cfg.dump(output_config_path) # Config is intentally reloaded here to avoid initialized # types cannot be serialized diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index f89dfc0a..f08f6844 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -74,6 +74,7 @@ from .mbpp import * # noqa: F401, F403 from .medbench import * # noqa: F401, F403 from .mgsm import * # noqa: F401, F403 from .mmlu import * # noqa: F401, F403 +from .mmlu_pro import * # noqa: F401, F403 from .MMLUArabic import * # noqa: F401, F403 from .multirc import * # noqa: F401, F403 from .narrativeqa import * # noqa: F401, F403 diff --git a/opencompass/datasets/benbench.py b/opencompass/datasets/benbench.py new file mode 100644 index 00000000..39bd8032 --- /dev/null +++ b/opencompass/datasets/benbench.py @@ -0,0 +1,88 @@ +# flake8: noqa +# yapf: disable +import json +from typing import Dict, Optional + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class BenBenchDataset(BaseDataset): + + @staticmethod + def load(path: str, tokenizer_path: str, tokenizer_kwargs: Optional[Dict] = dict(), num_gram: int=5, num_replica: int=5): + import numpy as np + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True, **tokenizer_kwargs) + data = [] + with open(path, encoding='utf-8') as f: + for index, line in enumerate(f): + line = json.loads(line) + if 'rewritten' in path: + text = line['rewritten_question'] + ' ' + line['rewritten_answer'] + elif 'origin' in path: + text = line['question'] + ' ' + line['answer'] + else: + raise ValueError(f'Unknown file type: {path}') + tokens = tokenizer.encode(text, add_special_tokens=False) + if len(tokens) >= num_gram + max(num_replica, 2): + starting_points = np.linspace(2, len(tokens) - num_gram, num=num_replica, endpoint=True, dtype=int).tolist() + else: + starting_points = np.linspace(2, max(2, len(tokens)), num=num_replica, endpoint=True, dtype=int).tolist() + for s in starting_points: + data.append({ + 'index': index, + 'prompt': tokenizer.decode(tokens[:s]), + 'reference': tokenizer.decode(tokens[s:s+num_gram]) + }) + dataset = Dataset.from_list(data) + return dataset + +def exact_match_score(predicted_text, original_text): + return predicted_text == original_text + +def edit_similarity_score(predicted_text, original_text): + # Calculate normalized edit distance + import editdistance + + edit_dist = editdistance.eval(predicted_text, original_text) + max_length = max(len(predicted_text), len(original_text), 1) + edit_similarity = 1 - (edit_dist / max_length) + return edit_similarity + +def rouge_l_score(predicted_text, original_text): + # Calculate Rouge-L score + from rouge_score import rouge_scorer + + scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) + rouge_score = scorer.score(original_text, 
predicted_text)['rougeL'].fmeasure + return rouge_score + +@ICL_EVALUATORS.register_module() +class BenbenEvaluator(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return {'error': 'pred and refr length mismatch'} + + valid_exact_match, valid_edit_similarity, valid_rouge_score = 0, 0, 0 + total = len(predictions) + for pred, ref in zip(predictions, references): + exact_match = exact_match_score(pred, ref) + edit_similarity = edit_similarity_score(pred, ref) + rougeL = rouge_l_score(pred, ref) + + valid_exact_match += exact_match + valid_edit_similarity += edit_similarity > 0.75 + valid_rouge_score += rougeL > 0.75 + + return { + 'exact_match': valid_exact_match / total * 100, + 'edit_similarity': valid_edit_similarity / total * 100, + 'rougeL': valid_rouge_score / total * 100, + } diff --git a/opencompass/datasets/charm.py b/opencompass/datasets/charm.py index 474eb139..652b0842 100644 --- a/opencompass/datasets/charm.py +++ b/opencompass/datasets/charm.py @@ -49,7 +49,7 @@ class CharmDataset(BaseDataset): @staticmethod def load(path: str, name: str): - with open(osp.join(path, f'{name}.json'), 'r') as f: + with open(osp.join(path, f'{name}.json'), 'r', encoding='utf-8') as f: data = json.load(f)['examples'] dataset = Dataset.from_list(data) return dataset diff --git a/opencompass/datasets/humaneval.py b/opencompass/datasets/humaneval.py index 4bf23f18..2002e469 100644 --- a/opencompass/datasets/humaneval.py +++ b/opencompass/datasets/humaneval.py @@ -1,3 +1,6 @@ +# flake8: noqa: E501 +# yapf: disable +import copy import json import os.path as osp import re @@ -11,6 +14,17 @@ from opencompass.registry import LOAD_DATASET from .base import BaseDataset +HUMANEVAL_IMPORT_ERROR = '''\ +Please install human_eval use following steps: +git clone git@github.com:open-compass/human-eval.git +cd human-eval && pip install -e .''' + +HUMANEVAL_PLUS_IMPORT_ERROR = '''\ +Please install evalplus use following steps: +git clone --recurse-submodules git@github.com:open-compass/human-eval.git +cd human-eval +pip install -e . 
+pip install -e evalplus''' @LOAD_DATASET.register_module() class HumanevalDataset(BaseDataset): @@ -35,199 +49,125 @@ class HumanevalDataset(BaseDataset): dataset = [] with open(path, 'r', encoding='utf-8') as f: for line in f: - dataset.extend( - [json.loads(line.strip()) for _ in range(num_repeats)]) + line = json.loads(line) + dataset.extend([copy.deepcopy(line) for _ in range(num_repeats)]) return Dataset.from_list(dataset) -class HumanEvaluator(BaseEvaluator): +class HumanEvalEvaluator(BaseEvaluator): """Evaluator for HumanEval or EvalPlus.""" - def __init__(self, - k: List[int] = [1, 10, 100], - metric: str = 'HumanEval') -> None: - self.metric = metric - assert self.metric in ['HumanEval', 'EvalPlus'] - if self.metric == 'HumanEval': - try: - from human_eval.data import HUMAN_EVAL, write_jsonl - from human_eval.evaluation import \ - evaluate_functional_correctness - self.write_jsonl = write_jsonl - self.HUMAN_EVAL = HUMAN_EVAL - self.eval = evaluate_functional_correctness - except ImportError: - raise ImportError( - 'Please install human_eval use following steps:\n' - 'git clone git@github.com:open-compass/human-eval.git\n' - 'cd human-eval && pip install -e .') - else: - try: - from evalplus.data import write_jsonl - from evalplus.evaluate import evaluate - self.write_jsonl = write_jsonl - self.eval = evaluate - except ImportError: - raise ImportError( - 'Please install evalplus use following steps:\n' - 'git clone --recurse-submodules git@github.com:open-compass/human-eval.git\n' # noqa - 'cd human-eval\n' - 'pip install -e .\n' - 'pip install -e evalplus\n') + def __init__(self, k: List[int] = [1, 10, 100]) -> None: + try: + import human_eval + except ImportError: + raise ImportError(HUMANEVAL_IMPORT_ERROR) + self.k = k super().__init__() def score(self, predictions, references, test_set): + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + + from human_eval.data import HUMAN_EVAL, write_jsonl + from human_eval.evaluation import evaluate_functional_correctness + prompts = [item['prompt'] for item in test_set] humaneval_preds = [] - if self.metric == 'HumanEval': - # create json file in human_eval format - for preds, refer in zip(predictions, references): - # suits for two case - # 1. use repeated dataset - # 2. 
use `num_return_sequences` to generate multiple responses - if not isinstance(preds, list): - preds = [preds] - for pred in preds: - humaneval_preds.append({ - 'task_id': refer, - 'completion': pred - }) - with tempfile.TemporaryDirectory() as tmp_dir: - out_dir = osp.join(tmp_dir, 'human_eval.json') - self.write_jsonl(out_dir, humaneval_preds) - score = self.eval(out_dir, - self.k, - n_workers=4, - timeout=3.0, - problem_file=self.HUMAN_EVAL) - return {f'humaneval_{k}': score[k] * 100 for k in score} - else: - for preds, refer, prompt in zip(predictions, references, prompts): - if not isinstance(preds, list): - preds = [preds] - for pred in preds: - humaneval_preds.append({ - 'task_id': refer, - 'solution': prompt + pred - }) - with tempfile.TemporaryDirectory() as tmp_dir: - out_dir = osp.join(tmp_dir, 'human_eval.jsonl') - self.write_jsonl(out_dir, humaneval_preds) - flags = dict(dataset='humaneval', - samples=out_dir, - base_only=None, - parallel=None, - i_just_wanna_run=None, - test_details=0.2, - min_time_limit=0.2, - gt_time_limit_factor=4.0, - mini=None) - score = self.eval(flags) - return {f'humaneval_plus_{k}': score[k] * 100 for k in score} + # create json file in human_eval format + for preds, refer in zip(predictions, references): + # suits for two case + # 1. use repeated dataset + # 2. use `num_return_sequences` to generate multiple responses + if not isinstance(preds, list): + preds = [preds] + for pred in preds: + humaneval_preds.append({'task_id': refer, 'completion': pred}) + with tempfile.TemporaryDirectory() as tmp_dir: + out_dir = osp.join(tmp_dir, 'human_eval.json') + write_jsonl(out_dir, humaneval_preds) + score = evaluate_functional_correctness(out_dir, self.k, n_workers=4, timeout=3.0, problem_file=HUMAN_EVAL) + + detail_path = osp.join(tmp_dir, 'human_eval.json_results.jsonl') + details = {} + with open(detail_path, 'r') as f: + for index, line in enumerate(f): + line = json.loads(line) + line['is_correct'] = line['passed'] + line['prompt'] = prompts[index] + details[str(index)] = line + + results = {f'humaneval_{k}': score[k] * 100 for k in score} + results['details'] = details + return results -def humaneval_postprocess(text: str) -> str: - if '```' in text: - blocks = re.findall(r'```(.*?)```', text, re.DOTALL) - if len(blocks) == 0: - text = text.split('```')[1] # fall back to default strategy - else: - text = blocks[0] # fetch the first code block - if not text.startswith('\n'): # in case starting with ```python - text = text[max(text.find('\n') + 1, 0):] - if text.strip().startswith('from') or text.strip().startswith('import'): - def_idx = text.find('def') - if def_idx != -1: - text = text[max(text.find('\n', def_idx) + 1, 0):] - text = text.split('\n\n')[0] - text = text.lstrip('\n') - if text.strip().startswith('def'): - text = '\n'.join(text.split('\n')[1:]) - if not text.startswith(' '): - if text.startswith(' '): - text = ' ' + text.lstrip() - else: - text = '\n'.join([' ' + line for line in text.split('\n')]) - return text +class HumanEvalPlusEvaluator(BaseEvaluator): + """Evaluator for HumanEval or EvalPlus.""" + + def __init__(self, k: List[int] = [1, 10, 100]) -> None: + try: + import evalplus + except ImportError: + raise ImportError(HUMANEVAL_PLUS_IMPORT_ERROR) + + self.k = k + super().__init__() + + def score(self, predictions, references, test_set): + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + + from evalplus.data import write_jsonl + from evalplus.evaluate import evaluate + + prompts = 
[item['prompt'] for item in test_set] + humaneval_preds = [] + for preds, refer, prompt in zip(predictions, references, prompts): + if not isinstance(preds, list): + preds = [preds] + for pred in preds: + humaneval_preds.append({'task_id': refer, 'solution': prompt + pred}) + with tempfile.TemporaryDirectory() as tmp_dir: + out_dir = osp.join(tmp_dir, 'human_eval.jsonl') + write_jsonl(out_dir, humaneval_preds) + flags = dict( + dataset='humaneval', + samples=out_dir, + base_only=None, + parallel=None, + i_just_wanna_run=None, + test_details=0.2, + min_time_limit=0.2, + gt_time_limit_factor=4.0, + mini=None, + ) + score = evaluate(flags) + results_path = osp.join(tmp_dir, 'human_eval_eval_results.json') + with open(results_path, 'r') as f: + results = json.load(f) + details = {} + for index in range(len(predictions)): + r = results['eval'][references[index]] + + details[str(index)] = { + 'prompt': prompts[index], + 'prediction': predictions[index], + 'reference': references[index], + 'base_result': r['base'][0][0], + 'plus_result': r['plus'][0][0], + 'is_correct': r['base'][0][0] == 'success' and r['plus'][0][0] == 'success', + } + if r['nfiles'] > 1: + details[str(index)]['warning'] = 'Multiple files in the solution. Details may be wrong.' + results = {f'humaneval_plus_{k}': score[k] * 100 for k in score} + results['details'] = details + return results def humaneval_postprocess_v2(text: str) -> str: - """This is an advanced version of previous postprocess to handle more - situations, better to use this one.""" - try: - # for chatGLM related text - eval_text = eval(text) - except Exception: - pass - else: - if isinstance(eval_text, str): - text = eval_text - text = text.lstrip('\n') - if '```' in text: - blocks = re.findall(r'```(.*?)```', text, re.DOTALL) - if len(blocks) == 0: - text = text.split('```')[1] # fall back to default strategy - else: - text = blocks[0] # fetch the first code block - if not text.startswith('\n'): # in case starting with ```python - text = text[max(text.find('\n') + 1, 0):] - if text.strip().startswith('from') or text.strip().startswith('import'): - def_idx = text.find('def') - if def_idx != -1: - text = text[max(text.find('\n', def_idx) + 1, 0):] - # remove empty lines - text = '\n'.join([line for line in text.split('\n') if line != '']) - text = text.lstrip('\n') - if text.strip().startswith('def'): - text = '\n'.join(text.split('\n')[1:]) - # deal with the indentation error - if text.startswith(' '): - text = ' ' + text.lstrip() - else: - text = '\n'.join([' ' + line for line in text.split('\n')]) - text = text.split('\n') - - # If number of leading space reduces, we assume that the code block ends. 
- min_leading_space = None - end_index = None - for index, line in enumerate(text): - if line.strip() == '' or line.strip()[0] in ["'", '"', '#']: - continue - current_leading_space = len(line.rstrip()) - len(line.strip()) - if min_leading_space is None: - min_leading_space = current_leading_space - elif current_leading_space < min_leading_space: - end_index = index - break - if end_index is not None: - text = '\n'.join(text[:end_index]) - else: - text = '\n'.join(text) - return text - - -def humaneval_gpt_postprocess(text: str) -> str: - """Better answer postprocessor for better instruction-aligned models like - GPT.""" - if '```' in text: - blocks = re.findall(r'```(.*?)```', text, re.DOTALL) - if len(blocks) == 0: - text = text.split('```')[1] # fall back to default strategy - else: - text = blocks[0] # fetch the first code block - if not text.startswith('\n'): # in case starting with ```python - text = text[max(text.find('\n') + 1, 0):] - if text.strip().startswith('from') or text.strip().startswith('import'): - def_idx = text.find('def') - if def_idx != -1: - text = text[max(text.find('\n', def_idx) + 1, 0):] - text = text.split('\n\n\n')[0] - if text.strip().startswith('def'): - text = '\n'.join(text.split('\n')[1:]) - if not text.startswith(' '): - if text.startswith(' '): - text = ' ' + text.lstrip() - else: - text = '\n'.join([' ' + line for line in text.split('\n')]) + blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL) + if len(blocks) >= 1: + text = blocks[0] return text diff --git a/opencompass/datasets/mathbench.py b/opencompass/datasets/mathbench.py index 743a5f34..7d9c250a 100644 --- a/opencompass/datasets/mathbench.py +++ b/opencompass/datasets/mathbench.py @@ -75,10 +75,10 @@ class MathBenchDataset(BaseDataset): 'question': question, 'answer': entry['answer'].strip() } - # For PPL evaluation - for i in range(4): - info[chr(ord('A') + - i)] = entry['options'][i].strip() + # # For PPL evaluation + # for i in range(4): + # info[chr(ord('A') + + # i)] = entry['options'][i].strip() data.append(info) dataset = Dataset.from_list(data) @@ -104,3 +104,278 @@ def mathbench_postprocess(text: str, name: str) -> str: return numbers[0] if split else numbers[-1] return ans + + +@LOAD_DATASET.register_module() +class MathBenchBuggyDataset(BaseDataset): + + @staticmethod + def load(path: str, name: str, with_circular: bool = True): + data = [] + filename = osp.join(path, f'{name}.jsonl') + with open(filename, 'r', encoding='utf-8') as infile: + for id, line in enumerate(infile): + entry = json.loads(line) + if 'cloze' in name: + data.append({ + 'question': entry['question'].strip(), + 'answer': entry['answer'].strip() + }) + else: + if with_circular: + data.extend(get_circular_example(entry, id)) + else: + question = entry['question'].strip( + ) + '\n' + get_number(entry['options']) + info = { + 'question': question, + 'answer': entry['answer'].strip() + } + # For PPL evaluation + for i in range(4): + info[chr(ord('A') + + i)] = entry['options'][i].strip() + data.append(info) + + if 'cloze' not in name: + data = data[:(len(data) // 4 + 7) // 8 * 8] + dataset = Dataset.from_list(data) + return dataset + + +import collections + +from ..openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator +from ..registry import ICL_EVALUATORS + + +def first_option_postprocess(text: str, options: str, cushion=True) -> str: + """Find first valid option for text.""" + + # yapf: disable + # flake8: noqa: W605 + patterns = [ + f'答案是?\s*([{options}])', + f'答案是?\s*:\s*([{options}])', + 
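        # Descriptive note: the strict patterns in this list are tried in order and the
        # first match wins (covering both Chinese and English "the answer is X" phrasings);
        # the cushion_patterns defined further below are looser fallbacks consulted only
        # when cushion=True and no strict pattern matched, in which case the last
        # occurrence in the text is taken as the model's choice.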
f'答案是?\s*:\s*([{options}])', + f'答案应该?是\s*([{options}])', + f'答案应该?选\s*([{options}])', + f'答案为\s*([{options}])', + f'答案选\s*([{options}])', + f'选择?\s*([{options}])', + f'故选?\s*([{options}])' + f'只有选?项?\s?([{options}])\s?是?对', + f'只有选?项?\s?([{options}])\s?是?错', + f'只有选?项?\s?([{options}])\s?不?正确', + f'只有选?项?\s?([{options}])\s?错误', + f'说法不?对选?项?的?是\s?([{options}])', + f'说法不?正确选?项?的?是\s?([{options}])', + f'说法错误选?项?的?是\s?([{options}])', + f'([{options}])\s?是正确的', + f'([{options}])\s?是正确答案', + f'选项\s?([{options}])\s?正确', + f'所以答\s?([{options}])', + f'所以\s?([{options}][.。$]?$)', + f'所有\s?([{options}][.。$]?$)', + f'[\s,::,]([{options}])[。,,\.]?$', + f'[\s,,::][故即]([{options}])[。\.]?$', + f'[\s,,::]因此([{options}])[。\.]?$', + f'[是为。]\s?([{options}])[。\.]?$', + f'因此\s?([{options}])[。\.]?$', + f'显然\s?([{options}])[。\.]?$', + f'回答[\s::]\s?([{options}])', + f'Answer[\s::]\s?([{options}])', + f'答案是\s?(\S+)(?:。|$)', + f'答案应该是\s?(\S+)(?:。|$)', + f'答案为\s?(\S+)(?:。|$)', + f'[Tt]he answer is:?\s+\(?([{options}])\)?', + f'[Tt]he answer is option:?\s+\(?([{options}])\)?', + f'[Tt]he correct answer is:?\s+\(?([{options}])\)?', + f'[Tt]he correct answer is option:?\s+\(?([{options}])\)?', + f'[Tt]he answer to the question is:?\s+\(?([{options}])\)?', + ] + cushion_patterns = [ + f'^选项\s?([{options}])', + f'^([{options}])\s?选?项', + # f'[\s|^]([{options}])[\s。,,::\.$]', + f'[\s|^]([{options}])[。,,::\.$]', + f'1.\s?([{options}])[.。$]?$', + f'([{options}]):', + f'([{options}])', + ] + # flake8: noqa + # yapf: enable + for pattern in patterns: + match = re.search(pattern, text, re.DOTALL) + if match: + outputs = match.group(0) + for i in options: + if i in outputs: + return i, pattern + if cushion: + for pattern in cushion_patterns: + outputs = [] + current_text = text + while True: + match = re.search(pattern, current_text, re.DOTALL) + if match: + outputs.append(match.group(0)) + current_text = current_text[match.end():] + else: + break + # if len(outputs) >= 2: + # from IPython import embed; embed(); exit() + if outputs: + outputs = outputs[-1] + for i in options: + if i in outputs: + return i, pattern + return '', None + + +def remove_invisible_chars(text: str) -> str: + """Remove invisible characters.""" + text = re.sub(r'\s+', '', text) + text = re.sub(r'\u200b', '', text) + return text + + +@ICL_EVALUATORS.register_module() +class MathBenchCircularEvaluator(BaseEvaluator): + """Robust circular evaluator for multi-choice questions.""" + + def __init__(self) -> None: + super().__init__() + self.cp4 = ['ABCD', 'BCDA', 'CDAB', 'DABC'] + self.cp1 = ['ABCD'] + + def score(self, predictions, references, test_set): + """Calculate the accuracy of predictions. + + Args: + predictions (list): List of predictions. + references (list): List of references. + + Returns: + dict: A dict of evaluation results. 
+ """ + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + + extract_details = {} + extracted_predictions = [] + for index, p in enumerate(predictions): + extracted_p = None + matched_pattern = None + if '\\boxed' in p: + match = re.findall(r'\\boxed\{(.*)\}', p) + if match: + for m in match: + for j in range(4): + m = remove_invisible_chars(m) + o = remove_invisible_chars( + test_set[index]['options'][j]) + if m == o: + extracted_p = chr(ord('A') + j) + matched_pattern = 'boxed_answer' + break + else: + if m in ['A', 'B', 'C', 'D']: + extracted_p = m + matched_pattern = 'boxed_ABCD' + else: + continue + break + if extracted_p is None: + extracted_p, matched_pattern = first_option_postprocess( + p, 'ABCD') + extracted_predictions.append(extracted_p) + extract_details[str(index)] = { + 'question': test_set[index]['question'], + 'options': test_set[index]['options'], + 'origin_pred': p, + 'extracted_pred': extracted_p, + 'matched_pattern': matched_pattern, + 'ref': references[index], + } + predictions = extracted_predictions + + results = {} + results.update({'acc_4': 0, 'acc_1': 0}) + # Accuracy for patterns with no circular shift / 4 circular shifts + for index, (pred, reference) in enumerate(zip(predictions, + references)): + _, ref, circular_pattern = reference.split('--') + extract_details[str(index)]['is_correct'] = pred == ref + if circular_pattern in self.cp4: + results['acc_4'] += 1 if pred == ref else 0 + if circular_pattern in self.cp1: + results['acc_1'] += 1 if pred == ref else 0 + for k in ['acc_4', 'acc_1']: + results[k] = results[k] / len(predictions) * 4 / int( + k.split('_')[-1]) * 100 + + # Accuracy for patterns with no circular shift / 4 circular shifts + details = {4: {}, 1: {}} + for pred, reference in zip(predictions, references): + index, ref, circular_pattern = reference.split('--') + if index not in details[4]: + details[4][index] = [] + details[1][index] = [] + if circular_pattern in self.cp4: + details[4][index].append(True if pred == ref else False) + if circular_pattern in self.cp1: + details[1][index].append(True if pred == ref else False) + # Calculate accuracy for having at least j correct out of i total + for i in [1, 4]: + for j in range(0, i + 1): + count, total = 0, 0 + for index in details[i]: + if sum(details[i][index]) >= j: + count += 1 + total += 1 + results[f'more_{i}_{j}'] = count / total * 100 + # Consider fully correct as correct + for i in [1, 4]: + results[f'perf_{i}'] = results[f'more_{i}_{i}'] + + # Calculate voting accuracy + voting = {'vote_4': {}, 'vote_1': {}} + refs = {} + for pred, reference in zip(predictions, references): + index, ref, circular_pattern = reference.split('--') + c = circular_pattern + back_map = {'A': c[0], 'B': c[1], 'C': c[2], 'D': c[3]} + ref = back_map[ref] + if pred not in ['A', 'B', 'C', 'D']: + pred = '-' + else: + pred = back_map[pred] + if index not in voting['vote_4']: + voting['vote_4'][index] = collections.Counter() + voting['vote_1'][index] = collections.Counter() + refs[index] = ref + + if c in self.cp4: + voting['vote_4'][index][pred] += 1 + if c in self.cp1: + voting['vote_1'][index][pred] += 1 + for k in ['vote_4', 'vote_1']: + voting_count = 0 + for index in voting[k]: + if refs[index] == voting[k][index].most_common(1)[0][0]: + voting_count += 1 + results[k] = voting_count / len(voting[k]) * 100 + + # Calculate the frequency of ABCD in model predictions + prior_counts = {'A': 0, 'B': 0, 'C': 0, 'D': 0, '-': 0} + for pred, reference in 
zip(predictions, references): + if pred in ['A', 'B', 'C', 'D']: + prior_counts[pred] += 1 + else: + prior_counts['-'] += 1 + for k in ['A', 'B', 'C', 'D', '-']: + results[f'prior_{k}'] = prior_counts[k] / len(predictions) * 100 + + results['details'] = extract_details + return results diff --git a/opencompass/datasets/mmlu_pro.py b/opencompass/datasets/mmlu_pro.py new file mode 100644 index 00000000..b7debbba --- /dev/null +++ b/opencompass/datasets/mmlu_pro.py @@ -0,0 +1,31 @@ +# flake8: noqa +# yapf: disable + +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +def _parse(item): + choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'] + s = '' + for i, opt in enumerate(item['options']): + if opt == 'N/A': + continue + s += '{}. {}\n'.format(choices[i], opt) + item['options_str'] = s.strip() + item['cot_content'] = item['cot_content'].removeprefix("A: Let's think step by step.").strip() + return item + + +@LOAD_DATASET.register_module() +class MMLUProDataset(BaseDataset): + + @staticmethod + def load(category: str): + mmlu_pro = load_dataset('TIGER-Lab/MMLU-Pro') + mmlu_pro = mmlu_pro.filter(lambda x: x['category'] == category) + mmlu_pro = mmlu_pro.map(_parse) + return mmlu_pro diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py index 03acd545..4f952125 100644 --- a/opencompass/datasets/subjective/__init__.py +++ b/opencompass/datasets/subjective/__init__.py @@ -2,6 +2,8 @@ from .alignbench import AlignmentBenchDataset # noqa: F401, F403 from .arena_hard import ArenaHardDataset # noqa: F401, F403 from .compass_arena import CompassArenaDataset # noqa: F401, F403 from .compassbench import CompassBenchDataset # noqa: F401, F403 +from .compassbench_control_length_bias import \ + CompassBenchControlLengthBiasDataset # noqa: F401, F403 from .corev2 import Corev2Dataset # noqa: F401, F403 from .creationbench import CreationBenchDataset # noqa: F401, F403 from .fofo import FofoDataset # noqa: F401, F403 diff --git a/opencompass/datasets/subjective/compassbench_control_length_bias.py b/opencompass/datasets/subjective/compassbench_control_length_bias.py new file mode 100644 index 00000000..991b8b9b --- /dev/null +++ b/opencompass/datasets/subjective/compassbench_control_length_bias.py @@ -0,0 +1,130 @@ +# flake8: noqa +import json +import os.path as osp + +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET + +from ..base import BaseDataset + +base_prompt_zh = """ +请根据 用户问题 以及 相应的两个回答,判断哪一个回答更好。 +[用户问题] +{question} + +[回答1开始] +{prediction} +[回答1结束] + +回答1中的中文字符数:{prediction_cn_word_count} +回答1中的英文单词数:{prediction_en_word_count} + +[回答2开始] +{prediction2} +[回答2结束] + +回答2中的中文字符数:{prediction2_cn_word_count} +回答2中的英文单词数:{prediction2_en_word_count} + +请注意: +1. 若题目中有明确字数限制,打分时应该将字数纳入考虑。如果回答超出或少于规定的字数限制,应相应扣分。 +2. 在没有字数限制的情况下,回答的简洁性和直接性应被优先考虑,除非详细程度对于理解答案至关重要。 +3. 如果两个回答都准确地解决了用户的问题,但一个回答更加简洁,而另一个回答提供了不必要的额外信息,那么简洁的回答可能会得到更高的评分。 +4. 在评分时,还应考虑回答的可读性和用户友好性,例如代码的清晰度和注释的充分性。 + +请先对两个回答进行评价,最后在以下 3 个选项中做出选择: +A. 回答1更好 +B. 回答2更好 +C. 回答1、2平局 + +如果你认为回答1更好,你的输出应形如: +评价1:回答1 xxx +评价2:回答2 xxx +选择:[[A]] + +如果你认为回答2更好,你的输出应形如: +评价1:回答1 xxx +评价2:回答2 xxx +选择:[[B]] + +如果你认为回答1、2打成平手,你的输出应形如: +评价1:回答1 xxx +评价2:回答2 xxx +选择:[[C]] +""".strip() + +base_prompt_en = """ +Please evaluate the two responses based on the user's question and then choose from the following three options: +A. Response 1 is better +B. Response 2 is better +C. 
Both responses are equal + +[user's question] +{question} + +[Response 1 Start] +{prediction} +[Response 1 End] + +The number of Chinese characters in Response 1: {prediction_cn_word_count} +The number of English words in Response 1: {prediction_en_word_count} + +[Response 2 Start] +{prediction2} +[Response 2 End] + +The number of Chinese characters in Response 2: {prediction2_cn_word_count} +The number of English words in Response 2: {prediction2_en_word_count} + +Note: + +1. If there is a clear word limit in the question, the word count should be taken into consideration when scoring. If the answer exceeds or falls short of the specified word limit, points should be deducted accordingly. +2. In the absence of a word limit, the conciseness and directness of the answer should be given priority, unless the level of detail is essential for understanding the answer. +3. If both answers accurately solve the user's question, but one is more concise and the other provides unnecessary additional information, the concise answer may receive a higher score. +4. When scoring, the readability and user-friendliness of the answer should also be considered, such as the clarity of the code and the adequacy of the comments. + + +If you believe that Response 1 is better, your output should be formatted as follows: +Evaluation 1: Response 1 xxx +Evaluation 2: Response 2 xxx +Choice: [[A]] + +If you believe that Response 2 is better, your output should be formatted as follows: +Evaluation 1: Response 1 xxx +Evaluation 2: Response 2 xxx +Choice: [[B]] + +If you believe that both responses are equally good, your output should be formatted as follows: +Evaluation 1: Response 1 xxx +Evaluation 2: Response 2 xxx +Choice: [[C]] +""".strip() + + +@LOAD_DATASET.register_module() +class CompassBenchControlLengthBiasDataset(BaseDataset): + + def load(self, path: str, name: str): + filename = osp.join(path, f'{name}.json') + raw_data = [] + with open(filename, 'r', encoding='utf-8') as f: + json_data = json.load(f) + for problem in json_data: + question = problem['question'] + lan = problem['language'] + others = problem['others'] + judge_prompt = base_prompt_zh if lan == 'zh' else base_prompt_en + judge_prompt = judge_prompt.replace('{question}', question) + raw_data.append({ + 'question': question, + 'judge_prompt': judge_prompt, + 'judge': { + 'lan': lan, + 'level': others['level'], + 'category': problem['category'], + 'question': question + } + }) + dataset = Dataset.from_list(raw_data) + return dataset diff --git a/opencompass/datasets/subjective/mtbench101.py b/opencompass/datasets/subjective/mtbench101.py index 1627efad..2a7317b6 100644 --- a/opencompass/datasets/subjective/mtbench101.py +++ b/opencompass/datasets/subjective/mtbench101.py @@ -5,6 +5,7 @@ import re from typing import Optional from datasets import Dataset, DatasetDict +from torch.utils.data import DataLoader from opencompass.registry import LOAD_DATASET diff --git a/opencompass/datasets/taco.py b/opencompass/datasets/taco.py index 7d548477..fc4e89be 100644 --- a/opencompass/datasets/taco.py +++ b/opencompass/datasets/taco.py @@ -234,7 +234,8 @@ class TACOEvaluator(BaseEvaluator): return pass_at_k def score(self, predictions, references, test_set): - assert len(predictions) == len(references) + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} generations = defaultdict(list) for refer, pred in zip(references, predictions): pred = self.post_process(pred) diff --git a/opencompass/models/__init__.py 
b/opencompass/models/__init__.py index f41ff5d3..41ba0c2e 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -1,13 +1,14 @@ from .accessory import LLaMA2AccessoryModel # noqa: F401 from .ai360_api import AI360GPT # noqa: F401 from .alaya import AlayaLM # noqa: F401 -from .baichuan_api import BaiChuan, BaiChuan3 # noqa: F401 +from .baichuan_api import BaiChuan # noqa: F401 from .baidu_api import ERNIEBot # noqa: F401 from .base import BaseModel, LMTemplateParser # noqa: F401 from .base_api import APITemplateParser, BaseAPIModel # noqa: F401 from .bytedance_api import ByteDance # noqa: F401 from .claude_api import Claude # noqa: F401 from .deepseek_api import DeepseekAPI # noqa: F401 +from .doubao import Doubao # noqa: F401 from .gemini_api import Gemini # noqa: F401 from .glm import GLM130B # noqa: F401 from .huggingface import HuggingFace # noqa: F401 diff --git a/opencompass/models/ai360_api.py b/opencompass/models/ai360_api.py index 223a6d79..49b1c71e 100644 --- a/opencompass/models/ai360_api.py +++ b/opencompass/models/ai360_api.py @@ -125,10 +125,16 @@ class AI360GPT(BaseAPIModel): while max_num_retries < self.retry: self.acquire() # payload = json.dumps(data) - raw_response = requests.request('POST', - url=self.url, - headers=self.headers, - json=data) + try: + raw_response = requests.request('POST', + url=self.url, + headers=self.headers, + json=data) + except Exception as e: + self.release() + print(e) + max_num_retries += 1 + continue response = raw_response.json() self.release() diff --git a/opencompass/models/baichuan_api.py b/opencompass/models/baichuan_api.py index 9bca78e6..a29a641f 100644 --- a/opencompass/models/baichuan_api.py +++ b/opencompass/models/baichuan_api.py @@ -33,29 +33,26 @@ class BaiChuan(BaseAPIModel): def __init__( self, path: str, - api_key: str, + key: str, url: str, query_per_second: int = 2, max_seq_len: int = 2048, meta_template: Optional[Dict] = None, retry: int = 2, - generation_kwargs: Dict = { - 'temperature': 0.3, - 'top_p': 0.85, - 'top_k': 5, - 'with_search_enhance': False, - 'stream': False, - }): # noqa E125 + system_prompt: str = '', + ): super().__init__(path=path, max_seq_len=max_seq_len, query_per_second=query_per_second, meta_template=meta_template, - retry=retry, - generation_kwargs=generation_kwargs) - - self.api_key = api_key + retry=retry) + self.headers = { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer ' + key, + } self.url = url self.model = path + self.system_prompt = system_prompt def generate( self, @@ -96,29 +93,33 @@ class BaiChuan(BaseAPIModel): Returns: str: The generated string. 
""" - assert isinstance(input, (str, PromptList)) if isinstance(input, str): messages = [{'role': 'user', 'content': input}] else: messages = [] + msg_buffer, last_role = [], None for item in input: - msg = {'content': item['prompt']} - if item['role'] == 'HUMAN': - msg['role'] = 'user' - elif item['role'] == 'BOT': - msg['role'] = 'assistant' + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) - messages.append(msg) + if self.system_prompt: + system = {'role': 'system', 'content': self.system_prompt} + messages.insert(0, system) data = {'model': self.model, 'messages': messages} - data.update(self.generation_kwargs) - - headers = { - 'Content-Type': 'application/json', - 'Authorization': 'Bearer ' + self.api_key, - } max_num_retries = 0 while max_num_retries < self.retry: @@ -126,17 +127,20 @@ class BaiChuan(BaseAPIModel): try: raw_response = requests.request('POST', url=self.url, - headers=headers, + headers=self.headers, json=data) - response = raw_response.json() except Exception as err: print('Request Error:{}'.format(err)) - time.sleep(3) + time.sleep(2) continue + try: + response = raw_response.json() + except Exception as err: + print('Response Error:{}'.format(err)) + response = None self.release() - # print(response.keys()) - # print(response['choices'][0]['message']['content']) + if response is None: print('Connection error, reconnect.') # if connect error, frequent requests will casuse @@ -144,140 +148,32 @@ class BaiChuan(BaseAPIModel): # to slow down the request self.wait() continue + if raw_response.status_code == 200: + # msg = json.load(response.text) + # response msg = response['choices'][0]['message']['content'] self.logger.debug(f'Generated: {msg}') return msg - if raw_response.status_code != 200: - print(raw_response.json()) - time.sleep(1) + if raw_response.status_code == 401: + print('请求被拒绝 api_key错误') continue - print(response) + elif raw_response.status_code == 400: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The request was rejected because high risk' + return msg + elif raw_response.status_code == 429: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(5) + continue + else: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(1) + max_num_retries += 1 - raise RuntimeError(response) - - -class BaiChuan3(BaseAPIModel): - - def __init__( - self, - path: str, - api_key: str, - url: str, - query_per_second: int = 2, - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - retry: int = 2, - ): # noqa E125 - super().__init__(path=path, - max_seq_len=max_seq_len, - query_per_second=query_per_second, - meta_template=meta_template, - retry=retry) - - self.api_key = api_key - self.url = url - self.model = path - - def generate( - self, - inputs: List[PromptType], - max_out_len: int = 512, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[PromptType]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - - Returns: - List[str]: A list of generated strings. 
- """ - with ThreadPoolExecutor() as executor: - results = list( - executor.map(self._generate, inputs, - [max_out_len] * len(inputs))) - self.flush() - return results - - def _generate( - self, - input: PromptType, - max_out_len: int = 512, - ) -> str: - """Generate results given an input. - - Args: - inputs (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - - Returns: - str: The generated string. - """ - - assert isinstance(input, (str, PromptList)) - - if isinstance(input, str): - history = [] - prompt = input - else: - messages = [] - msg_buffer, last_role = [], None - for item in input: - role = 'BOT' if item['role'] == 'BOT' else 'USER' - if role != last_role and last_role is not None: - messages.append({ - 'data': '\n'.join(msg_buffer), - 'from': 0 if last_role == 'USER' else 1 - }) - msg_buffer = [] - msg_buffer.append(item['prompt']) - last_role = role - messages.append({ - 'data': '\n'.join(msg_buffer), - 'from': 0 if last_role == 'USER' else 1 - }) - history = messages[:-1] - prompt = messages[-1]['data'] - - data = { - 'access_token_key': self.api_key, - 'app_info': { - 'id': 123 - }, - 'prompt': { - 'data': prompt - }, - 'history': history, - } - - for _ in range(self.retry): - try: - response = requests.post(self.url, json=data) - except Exception as e: - print(e) - continue - if response is None or response.status_code != 200: - code = response.status_code if response else -1 - print(f'[chat_api]-[failed] request err, status_code: {code}') - continue - try: - response = response.json() - except Exception as e: - print(e) - continue - print(response) - status = response.get('answer', {}).get('status', 0) - session_status = response.get('session_info', {}).get('status', 0) - if status < 0 or session_status < 0: - print('[chat_api]-[warn] prompt or answer is unsafe') - return 'Rejection: unsafe prompt or answer' - return response.get('answer', {}).get('data', '') - - raise RuntimeError(response['msg']) + raise RuntimeError(raw_response) diff --git a/opencompass/models/baidu_api.py b/opencompass/models/baidu_api.py index 51d81ff5..f1a126ef 100644 --- a/opencompass/models/baidu_api.py +++ b/opencompass/models/baidu_api.py @@ -147,6 +147,8 @@ class ERNIEBot(BaseAPIModel): messages = [] msg_buffer, last_role = [], None for item in input: + if not item['prompt']: + continue if item['role'] == 'BOT': role = 'assistant' else: # USER or SYSTEM diff --git a/opencompass/models/doubao.py b/opencompass/models/doubao.py new file mode 100644 index 00000000..83273770 --- /dev/null +++ b/opencompass/models/doubao.py @@ -0,0 +1,110 @@ +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Union + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +class Doubao(BaseAPIModel): + + def __init__( + self, + path: str, + endpoint_id: str, + access_key: str, + secret_key: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + ): # noqa E125 + super().__init__( + path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry, + ) + self.endpoint_id = endpoint_id + self.access_key = access_key + self.secret_key = secret_key + try: + from volcenginesdkarkruntime import Ark + except ImportError: + self.logger.error( + 'To use the Doubao API, you 
need to install sdk with ' + '`pip3 install volcengine-python-sdk`') + + self.client = Ark(ak=self.access_key, sk=self.secret_key) + + def generate(self, + inputs: List[PromptType], + max_out_len: int = 512) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate(self, input: PromptType, max_out_len: int = 512) -> str: + """Generate results given an input. + + Args: + inputs (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. + """ + + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + for item in input: + msg = {'content': item['prompt']} + if item['role'] == 'HUMAN': + msg['role'] = 'user' + elif item['role'] == 'BOT': + msg['role'] = 'assistant' + elif item['role'] == 'SYSTEM': + msg['role'] = 'system' + else: + raise ValueError(f'Invalid role: {item["role"]}') + messages.append(msg) + + data = dict(model=self.endpoint_id, messages=messages) + + for _ in range(self.retry): + try: + completion = self.client.chat.completions.create(**data) + except Exception as e: + print(e) + time.sleep(1) + continue + + generated = completion.choices[0].message.content + self.logger.debug(f'Generated: {generated}') + return completion.choices[0].message.content + + raise RuntimeError(f'Failed to respond in {self.retry} retrys') diff --git a/opencompass/models/huggingface_above_v4_33.py b/opencompass/models/huggingface_above_v4_33.py index ae13ae10..329ea2a3 100644 --- a/opencompass/models/huggingface_above_v4_33.py +++ b/opencompass/models/huggingface_above_v4_33.py @@ -12,35 +12,32 @@ PromptType = Union[PromptList, str] def _get_stopping_criteria(stop_words, tokenizer, batch_size): - from transformers import (PreTrainedTokenizer, StoppingCriteria, - StoppingCriteriaList) + from transformers import StoppingCriteria, StoppingCriteriaList class MultiTokenEOSCriteria(StoppingCriteria): """Criteria to stop on the specified multi-token sequence.""" - def __init__(self, sequence: str, tokenizer: PreTrainedTokenizer, batch_size: int): + def __init__(self, stop_words: List[str], tokenizer, batch_size: int): self.done_tracker = [False] * batch_size - self.sequence = sequence - self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False) - self.sequence_id_len = len(self.sequence_ids) + self.stop_words, self.max_sequence_id_len = [], 0 + for s in stop_words: + self.stop_words.append(s) + sequence_ids = tokenizer.encode(s, add_special_tokens=False) + self.max_sequence_id_len = max(self.max_sequence_id_len, len(sequence_ids)) self.tokenizer = tokenizer def __call__(self, input_ids, scores, **kwargs) -> bool: # compare the last len(stop) tokens - lookback_ids_batch = input_ids[:, -self.sequence_id_len:] + lookback_ids_batch = input_ids[:, -self.max_sequence_id_len:] lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) for i, done in enumerate(self.done_tracker): if done: continue - self.done_tracker[i] = 
self.sequence in lookback_tokens_batch[i] + self.done_tracker[i] = any(s in lookback_tokens_batch[i] for s in self.stop_words) return False not in self.done_tracker - criteria = [] - for stop_word in stop_words: - c = MultiTokenEOSCriteria(stop_word, tokenizer, batch_size) - criteria.append(c) - criteria = StoppingCriteriaList(criteria) - return criteria + c = MultiTokenEOSCriteria(stop_words, tokenizer, batch_size) + return StoppingCriteriaList([c]) def _get_possible_max_seq_len(max_seq_len, path): if max_seq_len is not None: @@ -59,7 +56,7 @@ def _get_possible_max_seq_len(max_seq_len, path): raise ValueError('max_seq_len is not provided and cannot be inferred from the model config.') -def _convert_chat_messages(inputs, merge_role=True): +def _convert_chat_messages(inputs, merge_role=True, skip_empty_prompt=True): outputs = [] for _input in inputs: messages = [] @@ -67,6 +64,8 @@ def _convert_chat_messages(inputs, merge_role=True): messages.append({'role': 'user', 'content': _input}) else: for item in _input: + if skip_empty_prompt and not item['prompt']: + continue role = { 'HUMAN': 'user', 'BOT': 'assistant', @@ -169,6 +168,7 @@ class HuggingFacewithChatTemplate(BaseModel): self.generation_kwargs = generation_kwargs self.fastchat_template = fastchat_template self.stop_words = list(set(stop_words + self._get_potential_stop_words(path))) + self.logger.info(f'using stop words: {self.stop_words}') for k, v in other_kwargs.items(): if v is not None: @@ -231,12 +231,19 @@ class HuggingFacewithChatTemplate(BaseModel): potential_stop_words = [] try: generation_config = GenerationConfig.from_pretrained(path) - for token_id in generation_config.eos_token_id: - potential_stop_words.append(self.tokenizer.decode(token_id)) except: - pass - potential_stop_words.append(self.tokenizer.eos_token) + generation_config = None + if generation_config and hasattr(generation_config, 'eos_token_id'): + if isinstance(generation_config.eos_token_id, int): + potential_stop_words.append(self.tokenizer.decode(generation_config.eos_token_id)) + else: + assert isinstance(generation_config.eos_token_id, list) + for token_id in generation_config.eos_token_id: + potential_stop_words.append(self.tokenizer.decode(token_id)) + if self.tokenizer.eos_token is not None: + potential_stop_words.append(self.tokenizer.eos_token) potential_stop_words = list(set(potential_stop_words)) + potential_stop_words = [s for s in potential_stop_words if s] return potential_stop_words def generate(self, diff --git a/opencompass/models/hunyuan_api.py b/opencompass/models/hunyuan_api.py index 07b26252..a5f241a6 100644 --- a/opencompass/models/hunyuan_api.py +++ b/opencompass/models/hunyuan_api.py @@ -1,4 +1,5 @@ import json +import time from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union @@ -33,22 +34,21 @@ class Hunyuan(BaseAPIModel): self.secret_id = secret_id self.secret_key = secret_key self.endpoint = endpoint + self.model = path + # pip install tencentcloud-sdk-python from tencentcloud.common import credential - from tencentcloud.common.common_client import CommonClient from tencentcloud.common.profile.client_profile import ClientProfile from tencentcloud.common.profile.http_profile import HttpProfile + from tencentcloud.hunyuan.v20230901 import hunyuan_client cred = credential.Credential(self.secret_id, self.secret_key) httpProfile = HttpProfile() httpProfile.endpoint = self.endpoint clientProfile = ClientProfile() clientProfile.httpProfile = httpProfile - self.client = CommonClient('hunyuan', 
- '2023-09-01', - cred, - 'ap-beijing', - profile=clientProfile) + self.client = hunyuan_client.HunyuanClient(cred, 'ap-beijing', + clientProfile) def generate(self, inputs: List[PromptType], @@ -87,35 +87,65 @@ class Hunyuan(BaseAPIModel): assert isinstance(input, (str, PromptList)) if isinstance(input, str): - messages = [{'role': 'user', 'content': input}] + messages = [{'Role': 'user', 'Content': input}] else: messages = [] + msg_buffer, last_role = [], None for item in input: - msg = {'Content': item['prompt']} - if item['role'] == 'HUMAN': - msg['Role'] = 'user' - elif item['role'] == 'BOT': - msg['Role'] = 'assistant' - messages.append(msg) + if not item['prompt']: + continue + if item['role'] == 'BOT': + role = 'assistant' + else: # USER or SYSTEM + role = 'user' + if role != last_role and last_role is not None: + messages.append({ + 'Content': '\n'.join(msg_buffer), + 'Role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = role + messages.append({ + 'Content': '\n'.join(msg_buffer), + 'Role': last_role + }) + messages = messages[-40:] + if messages[0]['Role'] == 'assistant': + messages = messages[1:] from tencentcloud.common.exception.tencent_cloud_sdk_exception import \ TencentCloudSDKException + from tencentcloud.hunyuan.v20230901 import models - data = {'Messages': messages} + data = {'Model': self.model, 'Messages': messages} - for _ in range(self.retry): + retry_counter = 0 + while retry_counter < self.retry: try: - resp = self.client.call_sse('ChatPro', data) - contents = [] - for event in resp: - part = json.loads(event['data']) - contents.append(part['Choices'][0]['Delta']['Content']) - answer = ''.join(contents) + req = models.ChatCompletionsRequest() + req.from_json_string(json.dumps(data)) + resp = self.client.ChatCompletions(req) + resp = json.loads(resp.to_json_string()) + answer = resp['Choices'][0]['Message']['Content'] - except TencentCloudSDKException as err: - print(err) + except TencentCloudSDKException as e: + self.logger.error(f'Got error code: {e.get_code()}') + if e.get_code() == 'ClientNetworkError': + return 'client network error' + elif e.get_code() in ['InternalError', 'ServerNetworkError']: + retry_counter += 1 + continue + elif e.get_code() in ['LimitExceeded']: + time.sleep(5) + continue + else: + print(e) + from IPython import embed + embed() + exit() - print(answer) + self.logger.debug(f'Generated: {answer}') return answer raise RuntimeError(f'Failed to respond in {self.retry} retrys') diff --git a/opencompass/models/minimax_api.py b/opencompass/models/minimax_api.py index 44ea267a..b8f803fd 100644 --- a/opencompass/models/minimax_api.py +++ b/opencompass/models/minimax_api.py @@ -322,11 +322,14 @@ class MiniMaxChatCompletionV2(BaseAPIModel): code = response.get('base_resp', {}).get('status_code') if code == 1002: # rate limit + self.logger.debug('Rate limit, wait for 1s') time.sleep(1) continue + elif code == 1026: + return 'The request was rejected because new risk' elif code == 1027: return 'The request was rejected because high risk' - print(messages, response) + self.logger.debug(f'Resp 200, Error: {response}') pass elif raw_response.status_code == 401: diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 5497f4c2..d79ce7d7 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -241,7 +241,12 @@ class OpenAI(BaseAPIModel): stop=None, temperature=temperature, ) - raw_response = requests.post(self.url, + if isinstance(self.url, list): + import 
random + url = self.url[random.randint(0, len(self.url) - 1)] + else: + url = self.url + raw_response = requests.post(url, headers=header, data=json.dumps(data)) except requests.ConnectionError: @@ -272,6 +277,9 @@ class OpenAI(BaseAPIModel): elif response['error']['code'] == 'invalid_prompt': self.logger.warn('Invalid prompt:', str(input)) return '' + elif response['error']['type'] == 'invalid_prompt': + self.logger.warn('Invalid prompt:', str(input)) + return '' self.logger.error('Find error message in response: ', str(response['error'])) diff --git a/opencompass/models/sensetime_api.py b/opencompass/models/sensetime_api.py index fd56e983..328881b3 100644 --- a/opencompass/models/sensetime_api.py +++ b/opencompass/models/sensetime_api.py @@ -104,14 +104,23 @@ class SenseTime(BaseAPIModel): messages = [{'role': 'user', 'content': input}] else: messages = [] + msg_buffer, last_role = [], None for item in input: - msg = {'content': item['prompt']} - if item['role'] == 'HUMAN': - msg['role'] = 'user' - elif item['role'] == 'BOT': - msg['role'] = 'assistant' - - messages.append(msg) + if not item['prompt']: + continue + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) data = {'messages': messages, 'model': self.model} if self.params is not None: @@ -157,6 +166,9 @@ class SenseTime(BaseAPIModel): return 'error:too long' else: print(raw_response.text) + from IPython import embed + embed() + exit() time.sleep(1) continue else: @@ -198,7 +210,4 @@ class SenseTime(BaseAPIModel): time.sleep(1) continue - return '' - raise RuntimeError( - f'request id: ' - f'{raw_response.headers.get("X-Request-Id")}, {raw_response.text}') + raise RuntimeError diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index 56aa5430..5ea020a5 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -72,12 +72,19 @@ class TurboMindModelwithChatTemplate(BaseModel): potential_stop_words = [] try: generation_config = GenerationConfig.from_pretrained(path) - for token_id in generation_config.eos_token_id: - potential_stop_words.append(self.origin_tokenizer.decode(token_id)) except: - pass - potential_stop_words.append(self.origin_tokenizer.eos_token) + generation_config = None + if generation_config and hasattr(generation_config, 'eos_token_id'): + if isinstance(generation_config.eos_token_id, int): + potential_stop_words.append(self.origin_tokenizer.decode(generation_config.eos_token_id)) + else: + assert isinstance(generation_config.eos_token_id, list) + for token_id in generation_config.eos_token_id: + potential_stop_words.append(self.origin_tokenizer.decode(token_id)) + if self.origin_tokenizer.eos_token is not None: + potential_stop_words.append(self.origin_tokenizer.eos_token) potential_stop_words = list(set(potential_stop_words)) + potential_stop_words = [s for s in potential_stop_words if s] return potential_stop_words def generate(self, @@ -119,12 +126,7 @@ class TurboMindModelwithChatTemplate(BaseModel): if do_sample: gen_config['top_k'] = 1000 gen_config['temperature'] = temperature - # if stopping_criteria: - # stop_words = gen_config.get('stop_words', []) - # for t in 
stopping_criteria: - # t = self.tokenizer.encode(t, add_bos=False) - # stop_words.append(t[0]) - # gen_config['stop_words'] = list(set(stop_words)) + from lmdeploy.messages import EngineGenerationConfig, GenerationConfig gen_config = GenerationConfig(**gen_config) gen_config = EngineGenerationConfig.From(gen_config, self.tokenizer) @@ -165,7 +167,7 @@ class TurboMindModelwithChatTemplate(BaseModel): """ assert type(prompt) is str, 'We only support string for TurboMind Python API' - input_ids = self.tokenizer.encode(prompt) + input_ids = self.tokenizer.encode(prompt, add_bos=False) for outputs in generator.stream_infer(session_id=session_id, input_ids=[input_ids], gen_config=gen_config, diff --git a/opencompass/models/vllm.py b/opencompass/models/vllm.py index 3919e2e1..ab042437 100644 --- a/opencompass/models/vllm.py +++ b/opencompass/models/vllm.py @@ -126,11 +126,22 @@ class VLLM(BaseModel): ] prompt_logprobs_list = [i.logprob for i in prompt_logprobs_list] prompt_logprobs_list = np.array(prompt_logprobs_list) + if mask_length is not None: + prompt_logprobs_list = prompt_logprobs_list[-mask_length[i]:] loss = -prompt_logprobs_list.sum(axis=-1) / len(prompt_token_ids) ce_loss.append(loss) return np.array(ce_loss) - def get_token_len(self, prompt: str) -> int: + def get_loglikelihood(self, inputs: List[str], + conts: List[str]) -> List[float]: + mask_length = [ + self.get_token_len(c, add_special_tokens=False) for c in conts + ] + return -self.get_ppl(inputs, mask_length) + + def get_token_len(self, + prompt: str, + add_special_tokens: bool = True) -> int: """Get lengths of the tokenized strings. Args: @@ -139,4 +150,7 @@ class VLLM(BaseModel): Returns: int: Length of the input tokens """ - return len(self.model.get_tokenizer().encode(prompt)) + tokenizer = self.model.get_tokenizer() + token_ids = tokenizer.encode(prompt, + add_special_tokens=add_special_tokens) + return len(token_ids) diff --git a/opencompass/models/vllm_with_tf_above_v4_33.py b/opencompass/models/vllm_with_tf_above_v4_33.py index 87421ace..cf79ea6f 100644 --- a/opencompass/models/vllm_with_tf_above_v4_33.py +++ b/opencompass/models/vllm_with_tf_above_v4_33.py @@ -68,12 +68,19 @@ class VLLMwithChatTemplate(BaseModel): potential_stop_words = [] try: generation_config = GenerationConfig.from_pretrained(path) - for token_id in generation_config.eos_token_id: - potential_stop_words.append(self.tokenizer.decode(token_id)) except: - pass - potential_stop_words.append(self.tokenizer.eos_token) + generation_config = None + if generation_config and hasattr(generation_config, 'eos_token_id'): + if isinstance(generation_config.eos_token_id, int): + potential_stop_words.append(self.tokenizer.decode(generation_config.eos_token_id)) + else: + assert isinstance(generation_config.eos_token_id, list) + for token_id in generation_config.eos_token_id: + potential_stop_words.append(self.tokenizer.decode(token_id)) + if self.tokenizer.eos_token is not None: + potential_stop_words.append(self.tokenizer.eos_token) potential_stop_words = list(set(potential_stop_words)) + potential_stop_words = [s for s in potential_stop_words if s] return potential_stop_words def generate(self, inputs: List[str], max_out_len: int, stopping_criteria: List[str] = [], **kwargs) -> List[str]: diff --git a/opencompass/models/xunfei_api.py b/opencompass/models/xunfei_api.py index 8de6f607..b3420f4f 100644 --- a/opencompass/models/xunfei_api.py +++ b/opencompass/models/xunfei_api.py @@ -287,6 +287,7 @@ class XunFeiSpark(BaseAPIModel): assert isinstance(input, (str, 
PromptList)) from sparkai.core.messages import ChatMessage + from sparkai.errors import SparkAIConnectionError from sparkai.llm.llm import ChatSparkLLM if isinstance(input, str): @@ -334,28 +335,34 @@ class XunFeiSpark(BaseAPIModel): msg = outputs[0].text self.logger.debug(f'Generated: {msg}') return msg - except ConnectionError as e: - match = re.match(r'Error Code: (\d+), Error: (.*)', - e.args[0], - flags=re.DOTALL) - if match: - error_code = int(match.group(1)) - msg = match.group(2) - if error_code == 10003: # query data exceed limit - self.logger.error(f'Error {error_code}: {msg}') - return msg - elif error_code in [10013, 10014]: # skip safety problem - self.logger.debug(f'Generated: {msg}') - return msg - elif error_code == 10020: # plugin result is empty - self.logger.error(f'Error {error_code}: {msg}') - return msg - elif error_code == 11202: # qps limit - time.sleep(1) - else: - self.logger.error(f'Error {error_code}: {msg}') + except (ConnectionError, SparkAIConnectionError) as e: + if isinstance(e, SparkAIConnectionError): + error_code = e.error_code + message = e.message + else: + match = re.match(r'Error Code: (\d+), Error: (.*)', + e.args[0], + flags=re.DOTALL) + if not match: raise e - raise e + error_code = int(match.group(1)) + message = match.group(2) + + if error_code == 10003: # query data exceed limit + self.logger.error(f'Error {error_code}: {message}') + return message + elif error_code in [10013, 10014]: # skip safety problem + self.logger.debug(f'Generated: {message}') + return message + elif error_code == 10020: # plugin result is empty + self.logger.error(f'Error {error_code}: {message}') + return message + elif error_code == 11202: # qps limit + time.sleep(1) + continue + else: + self.logger.error(f'Error {error_code}: {message}') + raise e except TimeoutError: self.logger.error('TimeoutError, sleep 60, retrying...') time.sleep(60) diff --git a/opencompass/models/yi_api.py b/opencompass/models/yi_api.py index d9bc70a7..61017a25 100644 --- a/opencompass/models/yi_api.py +++ b/opencompass/models/yi_api.py @@ -118,7 +118,11 @@ class YiAPI(BaseAPIModel): system = {'role': 'system', 'content': self.system_prompt} messages.insert(0, system) - data = {'model': self.model, 'messages': messages} + data = { + 'model': self.model, + 'messages': messages, + 'max_tokens': max_out_len + } max_num_retries = 0 while max_num_retries < self.retry: diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py index 5d903738..6a33666b 100644 --- a/opencompass/openicl/icl_evaluator/lm_evaluator.py +++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py @@ -1,6 +1,8 @@ # flake8: noqa: E501 +# yapf: disable import os.path as osp import random +import re from typing import Dict, List, Optional import mmengine @@ -25,12 +27,7 @@ def extract_dicts(data): return predictions -def order_preds_and_record_references( - predictions, - references, - infer_order, - seed=666, -): +def order_preds_and_record_references(predictions, references, infer_order, seed=666): """Order predictions based on args and recording regrading references. 
Args: @@ -42,8 +39,7 @@ def order_preds_and_record_references( random.seed(seed) list_of_preds = [[] for _ in range(len(predictions))] for i in range(len(predictions[0]['model_preds'])): - preds = [[pred['model_preds'][i], pred['model_name']] - for pred in predictions] + preds = [[pred['model_preds'][i], pred['model_name']] for pred in predictions] if infer_order == 'random': random.shuffle(preds) for j in range(len(preds)): @@ -51,19 +47,26 @@ def order_preds_and_record_references( references[i][f'answer{j+1}'] = preds[j][1] if infer_order == 'double': assert len(predictions) == 2 - list_of_preds = [ - a + b for a, b in zip(list_of_preds, reversed(list_of_preds)) - ] + list_of_preds = [a + b for a, b in zip(list_of_preds, reversed(list_of_preds))] reversed_references = [] for item in references: reversed_item = item.copy() - reversed_item['answer1'], reversed_item['answer2'] = reversed_item[ - 'answer2'], reversed_item['answer1'] + reversed_item['answer1'], reversed_item['answer2'] = reversed_item['answer2'], reversed_item['answer1'] reversed_references.append(reversed_item) references += reversed_references return list_of_preds, references +def count_chinese_characters(text): + words = re.findall(r'[\u4e00-\u9fff]', text) + return len(words) + + +def count_english_words(text): + words = re.findall(r'\b[a-zA-Z]+\b', text) + return len(words) + + class LMEvaluator: """Evaluate output with language model. @@ -99,8 +102,7 @@ class LMEvaluator: self.prompt_tmpl = ICL_PROMPT_TEMPLATES.build(prompt_template) if meta_review_prompt_template is not None: - self.meta_review_prompt_tmpl = ICL_PROMPT_TEMPLATES.build( - meta_review_prompt_template) + self.meta_review_prompt_tmpl = ICL_PROMPT_TEMPLATES.build(meta_review_prompt_template) max_out_len = judge_cfg.get('max_out_len', None) batch_size = judge_cfg.get('batch_size', None) @@ -123,10 +125,9 @@ class LMEvaluator: dup_indices = [] if isinstance(predictions, list): """Apply to multi-model comparison.""" - references = [{} for _ in range(len(predictions[0]['model_preds'])) - ] if references is None else references - predictions, references = order_preds_and_record_references( - predictions, references, infer_order) + if references is None: + references = [{} for _ in range(len(predictions[0]['model_preds']))] + predictions, references = order_preds_and_record_references(predictions, references, infer_order) # calculate dupicated predictions numbers total_predictions_num = len(predictions[0]) @@ -140,8 +141,8 @@ class LMEvaluator: elif isinstance(predictions, dict): """Apply to single-model scoring.""" - references = [{} for _ in range(len(predictions[0]['model_preds'])) - ] if references is None else references + if references is None: + references = [{} for _ in range(len(predictions[0]['model_preds']))] predictions = [predictions['model_preds']] # Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature. 
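A minimal, self-contained sketch of how the word-count helpers introduced above behave; these are the values placed into the prediction_cn_word_count / prediction_en_word_count fields that the CompassBench length-bias judge prompts reference (the sample strings below are illustrative only):

import re

def count_chinese_characters(text):
    # count CJK unified ideographs; punctuation and Latin text are ignored
    return len(re.findall(r'[\u4e00-\u9fff]', text))

def count_english_words(text):
    # count runs of ASCII letters delimited by word boundaries
    return len(re.findall(r'\b[a-zA-Z]+\b', text))

assert count_chinese_characters('你好,世界 hello world') == 4  # 你 好 世 界
assert count_english_words('你好,世界 hello world') == 2       # hello, world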
@@ -155,25 +156,23 @@ class LMEvaluator: del references[index] pred_dict = {} - if isinstance( - predictions[0][0], str - ): #single chat for format like [['xxx', 'xxxx'], ['xxx', 'xxxx']] + if isinstance(predictions[0][0], str): + # single chat for format like [['xxx', 'xxxx'], ['xxx', 'xxxx']] for i in range(len(predictions)): key = 'prediction' if i == 0 else f'prediction{i + 1}' gold_key = 'obj_gold' pred_dict[key] = predictions[i] pred_dict[gold_key] = references + pred_dict[key + '_en_word_count'] = [count_english_words(j) for j in predictions[i]] + pred_dict[key + '_cn_word_count'] = [count_chinese_characters(j) for j in predictions[i]] if judgements: for i in range(len(judgements)): key = 'judgement' if i == 0 else f'judgement{i + 1}' pred_dict[key] = judgements[i]['model_preds'] for j in range(len(references)): - references[j]['judge_model' + - str(i + 1)] = judgements[i]['model_name'] - - elif isinstance( - predictions[0][0], list - ): #multi round for format like [[[{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}], [{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}]]] + references[j]['judge_model' + str(i + 1)] = judgements[i]['model_name'] + elif isinstance(predictions[0][0], list): + # multi round for format like [[[{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}], [{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}]]] if self.pack_all_predictions: for i in range(len(predictions)): key = 'prediction' if i == 0 else f'prediction{i + 1}' @@ -189,58 +188,44 @@ class LMEvaluator: raise NotImplementedError( 'Not applied meta-reivew judge on multi-round dataset') else: - raise NotImplementedError( - f'{predictions[0][0]} with type {type(predictions[0][0])}, please check the postprocess you add to the prediction string is right or not, we suggest to return an empty string but not None' - ) + raise NotImplementedError(f'{predictions[0][0]} with type {type(predictions[0][0])}, please check the postprocess you add to the prediction string is right or not, we suggest to return an empty string but not None') + if self.dataset_cfg: dataset = build_dataset_from_cfg(self.dataset_cfg) if infer_order == 'double': - new_ds = { - k: dataset.test[k] * 2 - for k in dataset.test.column_names - } + new_ds = {k: dataset.test[k] * 2 for k in dataset.test.column_names} dataset.reader.dataset['test'] = Dataset.from_dict(new_ds) if len(dup_indices) != 0: - remaining_indices = [ - idx for idx in range(len(dataset.test)) - if idx not in dup_indices - ] - dataset.reader.dataset['test'] = dataset.test.select( - remaining_indices) - print( - f'Among total {total_predictions_num} predictions, there are {len(dup_indices)} predictions totally same, which are removed!' 
- ) + remaining_indices = [idx for idx in range(len(dataset.test)) if idx not in dup_indices] + dataset.reader.dataset['test'] = dataset.test.select(remaining_indices) + print(f'Among total {total_predictions_num} predictions, there are {len(dup_indices)} predictions totally same, which are removed!') for k, v in pred_dict.items(): dataset.reader.dataset['test'] = dataset.test.add_column(k, v) dataset.reader.input_columns.append(k) if references: dataset.reader.input_columns.append('reference') - dataset.reader.dataset['test'] = dataset.test.add_column( - 'reference', references) + dataset.reader.dataset['test'] = dataset.test.add_column('reference', references) else: # build a default dataset just for comparison from opencompass.datasets.lmeval import LMEvalDataset input_columns = list(pred_dict.keys()) if references: input_columns.append('reference') - dataset = LMEvalDataset(reader_cfg=dict( - input_columns=input_columns, - output_column=None, - train_split='test'), - reference=references, - **pred_dict) + dataset = LMEvalDataset( + reader_cfg=dict(input_columns=input_columns, output_column=None, train_split='test'), + reference=references, + **pred_dict + ) dataset.reader.output_column = 'reference' retriever = ZeroRetriever(dataset) + if meta: - self.inferencer.inference( - retriever=retriever, - prompt_template=self.meta_review_prompt_tmpl) + self.inferencer.inference(retriever=retriever, prompt_template=self.meta_review_prompt_tmpl) else: - self.inferencer.inference(retriever=retriever, - prompt_template=self.prompt_tmpl) + self.inferencer.inference(retriever=retriever, prompt_template=self.prompt_tmpl) output = mmengine.load(self.output_path) return self.postprocess(output) diff --git a/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py b/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py index a71c41f8..17bdf468 100644 --- a/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py @@ -1,8 +1,10 @@ """Direct Generation Inferencer.""" import inspect +import json import os import os.path as osp +import time from typing import List, Optional import mmengine @@ -71,6 +73,7 @@ class GenInferencer(BaseInferencer): self.max_out_len = max_out_len self.min_out_len = min_out_len self.stopping_criteria = stopping_criteria + self.dump_timer = kwargs.get('dump_timer', False) if self.model.is_api and save_every is None: save_every = 1 @@ -128,6 +131,9 @@ class GenInferencer(BaseInferencer): # 5. Inference for prompts in each batch logger.info('Starting inference process...') + + start_time_stamp = time.time() + num_sample = 0 for datum in tqdm(dataloader, disable=not self.is_main_process): if ds_reader.output_column: entry, golds = list(zip(*datum)) @@ -166,6 +172,9 @@ class GenInferencer(BaseInferencer): and self.is_main_process): output_handler.write_to_json(output_json_filepath, 'tmp_' + output_json_filename) + num_sample += len(datum) + + end_time_stamp = time.time() # 6. 
Output if self.is_main_process: @@ -175,6 +184,18 @@ class GenInferencer(BaseInferencer): if osp.exists(tmp_json_filepath): os.remove(tmp_json_filepath) + if self.dump_timer and self.is_main_process: + timer_filepath = os.path.join(output_json_filepath, 'timer', + 'time.jsonl') + os.makedirs(os.path.dirname(timer_filepath), exist_ok=True) + time_dict = { + 'dataset_name': output_json_filename.removesuffix('.json'), + 'time': end_time_stamp - start_time_stamp, + 'num_sample': num_sample + } + with open(timer_filepath, 'a') as f: + f.write(json.dumps(time_dict) + '\n') + return [ sample['prediction'] for sample in output_handler.results_dict.values() diff --git a/opencompass/partitioners/num_worker.py b/opencompass/partitioners/num_worker.py index 58f35847..6fd58d86 100644 --- a/opencompass/partitioners/num_worker.py +++ b/opencompass/partitioners/num_worker.py @@ -29,12 +29,17 @@ class NumWorkerPartitioner(BasePartitioner): def __init__(self, out_dir: str, num_worker: int = 8, + num_worker_split: Optional[int] = None, min_task_size: int = 16, strategy: str = 'heuristic', dataset_size_path: str = '.cache/dataset_size.json', keep_keys: Optional[List[str]] = None): super().__init__(out_dir=out_dir, keep_keys=keep_keys) + if strategy == 'split' and num_worker_split is not None: + self.logger.warning('num_worker_split is ignored with split.') + self.num_worker = num_worker + self.num_worker_split = num_worker_split or num_worker self.min_task_size = min_task_size self.dataset_size_path = dataset_size_path assert strategy in ('heuristic', 'split'), \ @@ -72,9 +77,9 @@ class NumWorkerPartitioner(BasePartitioner): chunks.append(dataset_split) if self.strategy == 'heuristic': - buckets = [[] for _ in range(self.num_worker)] + buckets = [[] for _ in range(self.num_worker_split)] for i, chunk in enumerate(chunks): - buckets[i % self.num_worker].append(chunk) + buckets[i % self.num_worker_split].append(chunk) for bucket in buckets: if len(bucket) > 0: diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index e568a965..40453ed0 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -232,14 +232,18 @@ class DLCRunner(BaseRunner): while True: # 1. Avoid to request dlc too frequently. # 2. DLC job may not be ready immediately after creation. - for _ in range(20): + num_retry = 60 + for retry_index in range(num_retry): time.sleep(2) try: job_info = json.loads( subprocess.getoutput(f'dlc get job {job_id}')) break except: # noqa: E722 - pass + if retry_index > num_retry // 3: + logger.warning( + f'Failed to get job info for {job_id}, ' + 'retrying...') else: raise RuntimeError( f'Failed to get job info for {job_id}') diff --git a/opencompass/runners/local.py b/opencompass/runners/local.py index 3be17e4e..297fba25 100644 --- a/opencompass/runners/local.py +++ b/opencompass/runners/local.py @@ -15,7 +15,7 @@ from mmengine.config import ConfigDict from tqdm import tqdm from opencompass.registry import RUNNERS, TASKS -from opencompass.utils import get_logger +from opencompass.utils import get_logger, model_abbr_from_cfg from .base import BaseRunner @@ -110,11 +110,13 @@ class LocalRunner(BaseRunner): if 'infer' in self.task_cfg.type.lower(): # If a model instance already exists, # do not reload it. 
- if hasattr(self, 'cur_model'): - task.run(self.cur_model) - else: - task.run() + task.run(cur_model=getattr(self, 'cur_model', + None), + cur_model_abbr=getattr( + self, 'cur_model_abbr', None)) self.cur_model = task.model + self.cur_model_abbr = model_abbr_from_cfg( + task.model_cfg) else: task.run() else: diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py index 8eeca328..7eef249d 100644 --- a/opencompass/summarizers/default.py +++ b/opencompass/summarizers/default.py @@ -17,7 +17,7 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg, from opencompass.utils.prompt import get_prompt_hash METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth', 'f1', 'exact_match'] -METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len'] +METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len', 'type'] def model_abbr_from_cfg_used_in_summarizer(model): if model.get('summarizer_abbr', None): @@ -167,6 +167,8 @@ class DefaultSummarizer: need_smart_metric = True if sg.get('std', False): default_metric = 'standard_deviation' + elif sg.get('sum', False): + default_metric = 'sum' elif sg.get('weights', []): default_metric = 'weighted_average' else: @@ -208,13 +210,16 @@ class DefaultSummarizer: try: numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0) except KeyError: - tmp_scores = {metric: {k.split('@')[0]: v for k, v in scores[metric].items()} for metric in scores} + tmp_scores = {metric: {k.split('@')[0]: v for k, v in scores[metric].items()}} numerator = sum(tmp_scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0) denominator = sum(sg['weights'].values()) else: numerator = sum(scores[metric].values()) denominator = len(scores[metric]) - scores[metric] = result[metric] = numerator / denominator + if default_metric == 'sum': + scores[metric] = result[metric] = numerator + else: + scores[metric] = result[metric] = numerator / denominator eval_modes = list(set(eval_modes)) eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed' diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py index 157d1713..22e25004 100644 --- a/opencompass/summarizers/subjective/__init__.py +++ b/opencompass/summarizers/subjective/__init__.py @@ -5,6 +5,7 @@ from .alpacaeval import AlpacaSummarizer from .arenahard import ArenaHardSummarizer from .compass_arena import CompassArenaSummarizer from .compassbench import CompassBenchSummarizer +from .compassbench_th import CompassBenchTHSummarizer from .corev2 import Corev2Summarizer from .creationbench import CreationBenchSummarizer from .flames import FlamesSummarizer diff --git a/opencompass/summarizers/subjective/compassbench.py b/opencompass/summarizers/subjective/compassbench.py index 7a34c54a..f51bcb23 100644 --- a/opencompass/summarizers/subjective/compassbench.py +++ b/opencompass/summarizers/subjective/compassbench.py @@ -1,5 +1,6 @@ # flake8: noqa # yapf: disable +import copy import os import os.path as osp import re @@ -7,7 +8,7 @@ from collections import defaultdict from datetime import datetime from itertools import product -import mmengine +import pandas as pd from mmengine import ConfigDict from tabulate import tabulate @@ -115,12 +116,14 @@ class CompassBenchSummarizer: win_model2 = defaultdict(float) categories = defaultdict(float) difficulties = defaultdict(float) + languages = defaultdict(float) model1 
= references[0]['answer1'] model2 = references[0]['answer2'] for prediction, reference in zip(judged_answers, references): categories[dataset_abbr] += 1 categories[reference['category']] += 1 - difficulties[reference['level']] += 1 + difficulties['Level-' + str(reference['level'])] += 1 + languages['Lan-' + reference['lan']] += 1 if prediction == 'A': if reference['answer1'] == model1: @@ -140,8 +143,12 @@ class CompassBenchSummarizer: win_model1[reference['category']] += score_1 win_model1[dataset_abbr] += score_1 + win_model1['Level-' + str(reference['level'])] += score_1 + win_model1['Lan-' + reference['lan']] += score_1 win_model2[reference['category']] += score_2 win_model2[dataset_abbr] += score_2 + win_model2['Level-' + str(reference['level'])] += score_2 + win_model2['Lan-' + reference['lan']] += score_2 for category in categories: win_model1[category] = win_model1[category] / categories[category] * 100 win_model1[category] = round(win_model1[category], 2) @@ -149,6 +156,12 @@ class CompassBenchSummarizer: win_model2[category] = round(win_model2[category], 2) win_model1['position_bias'] = bias_num win_model2['position_bias'] = bias_num + for difficulty in difficulties: + win_model1[difficulty] = win_model1[difficulty] / difficulties[difficulty] * 100 + win_model2[difficulty] = win_model2[difficulty] / difficulties[difficulty] * 100 + for language in languages: + win_model1[language] = win_model1[language] / languages[language] * 100 + win_model2[language] = win_model2[language] / languages[language] * 100 if judge_model not in scores: scores[judge_model] = {} @@ -156,7 +169,7 @@ class CompassBenchSummarizer: scores[judge_model][dataset_abbr] = {} scores[judge_model][dataset_abbr][model2] = win_model2 - return scores + return scores, difficulties, languages def summarize( self, @@ -172,10 +185,10 @@ class CompassBenchSummarizer: """ - scores = self.get_score(time_str) + scores, difficulties, languages = self.get_score(time_str) # scores['win_' + model1] = win_model1 output_dir, results_folder = get_outdir(self.cfg, time_str) - + all_judge_file_list = [] for idx, judge_model in enumerate(self.judge_models): judge_abbr = model_abbr_from_cfg(judge_model) @@ -183,8 +196,13 @@ class CompassBenchSummarizer: dataset_abbr = dataset_abbr_from_cfg(dataset) summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models] one_column = list(scores[judge_abbr][dataset_abbr].values())[0] - row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']] - row_headers = [dataset_abbr, 'position_bias'] + row_headers + detail_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias'] and i not in difficulties and i not in languages] + row_headers = [dataset_abbr, 'position_bias'] + for difficulty in difficulties: + row_headers += [difficulty] + for language in languages: + row_headers += [language] + row_headers += detail_headers headers = [''] + summarizer_model_abbrs table = [] for row_header in row_headers: @@ -199,7 +217,7 @@ class CompassBenchSummarizer: row.append(s) table.append(row) txt = tabulate(table, headers=headers) - print(txt) + # print(txt) if idx == len(self.judge_models): output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv') @@ -211,31 +229,15 @@ class CompassBenchSummarizer: for line in table: f.write(','.join(line) + '\n') print(output_filename) + all_judge_file_list.append(output_filename) - table = [] - summarizer_model_abbrs = 
[model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models] - headers = [''] + summarizer_model_abbrs - for dataset in self.cfg['datasets']: - dataset_abbr = dataset_abbr_from_cfg(dataset) - row = [dataset_abbr] - for model_cfg in self.compare_models: - model_abbr = model_abbr_from_cfg(model_cfg) - s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '') - if isinstance(s, float): - s = f'{s:.2f}' - if isinstance(s, int): - s = str(s) - row.append(s) - table.append(row) - txt = tabulate(table, headers=headers) - print(txt) + dfs = [pd.read_csv(file) for file in all_judge_file_list] - if idx == len(self.judge_models): - output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv') - else: - output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv') - with open(output_filename, 'w') as f: - f.write(','.join(headers) + '\n') - for line in table: - f.write(','.join(line) + '\n') - print(output_filename) + if len(dfs) > 1: + average_df = copy.deepcopy(dfs[0]) + for col in dfs[0].columns[1:]: + for i in range(1, len(dfs[0])): + average_df[col][i] = round(sum(df[col][i] for df in dfs) / len(dfs), 2) + average_csv_path = osp.join(output_dir, 'Averaged-' + dataset_abbr + '-report.csv') + average_df.to_csv(average_csv_path, index=False) + print(average_csv_path) diff --git a/opencompass/summarizers/subjective/compassbench_th.py b/opencompass/summarizers/subjective/compassbench_th.py new file mode 100644 index 00000000..9ec6deaf --- /dev/null +++ b/opencompass/summarizers/subjective/compassbench_th.py @@ -0,0 +1,353 @@ +# flake8: noqa +# yapf: disable +import copy +import os +import os.path as osp +import re +from collections import defaultdict +from datetime import datetime +from itertools import product + +import mmengine +import pandas as pd +from mmengine import ConfigDict +from tabulate import tabulate + +from opencompass.partitioners.sub_naive import remove_duplicate_pairs +from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg + + +def model_abbr_from_cfg_used_in_summarizer(model): + if model.get('summarizer_abbr', None): + return model['summarizer_abbr'] + else: + return model_abbr_from_cfg(model) + +def post_process_compass_arena(s): + if result := re.findall(r'(?:选择:|Choice: )\[\[([ABC])\]\]', s): + return result[0] + else: + return None + + + +def get_outdir(cfg, time_str): + """Get out put path. + + Args: + cfg (ConfigDict): The running config. + time_str (str): Current time. + """ + work_dir = cfg['work_dir'] + output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt') + output_dir = osp.join(osp.split(output_path)[0], f'{time_str}') + mmengine.mkdir_or_exist(output_dir) + results_folder = osp.join(work_dir, 'results') + return output_dir, results_folder + + +def get_judgeanswer_and_reference(dataset, subdir_path, post_process): + """Extract judgements (scores) and references. + + Args: + dataset (ConfigDict): Dataset config. + subdir_path (str): Model path in results dir. + post_process (function): The pre-defined extract function. 
+ """ + dataset_abbr = dataset_abbr_from_cfg(dataset) + filename = osp.join(subdir_path, dataset_abbr + '.json') + partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json') + if osp.exists(osp.realpath(filename)): + result = mmengine.load(filename) + elif osp.exists(osp.realpath(partial_filename)): + filename = partial_filename + result = {} + i = 1 + partial_dict_flag = 0 + while osp.exists(osp.realpath(filename)): + res = mmengine.load(filename) + for k, v in res.items(): + result[partial_dict_flag] = v + partial_dict_flag += 1 + filename = osp.join(subdir_path, + dataset_abbr + '_' + str(i) + '.json') + i += 1 + else: + result = {} + + if len(result) == 0: + print('*' * 100) + print('There are no results for ' + filename + ' or ' + + partial_filename) + print('*' * 100) + assert len(result) > 0 + + judged_answers = [] + references = [] + result_items = [] + for k, v in result.items(): + processed_judge = post_process(v['prediction']) + if processed_judge is not None: + judged_answers.append(processed_judge) + references.append(v['gold']) + result_items.append(v) + # else: + # print(v['prediction']) + # print('-' * 128) + if len(judged_answers) != len(result): + print( + f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!' + ) + if len(judged_answers) == 0: + print('*' * 100) + print( + 'There are no extracted judgements, please change your judge model or check your prompt!!!' + ) + print('*' * 100) + assert len(judged_answers) > 0 + return judged_answers, references, result_items + + + +def check_position_bias(judged_answers, references, banned_choice=['C']): + """Check position bias for judgellm's judgement. + + Args: + judged_answers: The successfully extracted judgement. + references: The references contains original question, which is used to located the same question for different position judgement. + """ + position_bias_flag = 0 + position_bias_dict = {} + for judge, ref in zip(judged_answers, references): + question = ref['question'] + question_hash = hash(question) + if question_hash not in position_bias_dict: + position_bias_dict[question_hash] = { + 'question': question, + 'judge': judge + } + else: + first_judge = position_bias_dict[question_hash]['judge'] + if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice: + # If second choice is same with first choice, there has position bias. + position_bias_flag += 1 + return position_bias_flag + + +def count_chinese_characters(text): + words = re.findall(r'[\u4e00-\u9fff]', text) + return len(words) + + +def count_english_words(text): + words = re.findall(r'\b[a-zA-Z]+\b', text) + return len(words) + + +class CompassBenchTHSummarizer: + """Do the subjectivity analyze based on evaluation results. + + Args: + config (ConfigDict): The configuration object of the evaluation task. + It's expected to be filled out at runtime. 
+ """ + + def __init__(self, + config: ConfigDict, + judge_type='general', + check_pos_bias=True, + summary_type='single', + word_count_threshold=None) -> None: + self.tasks = [] + self.cfg = config + self.base_models = self.cfg['eval']['partitioner']['base_models'] + self.compare_models = self.cfg['eval']['partitioner']['compare_models'] + self.judge_models = self.cfg.get('judge_models', None) + self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None) + self.judge_type = judge_type + assert self.judge_type in ['general'] + self.judge_map = {'general': post_process_compass_arena} + self.judge_function = self.judge_map[self.judge_type] + self.check_pos_bias = check_pos_bias + self.summary_type = summary_type + self.word_count_threshold = word_count_threshold + + def get_score(self, time_str): + output_dir, results_folder = get_outdir(self.cfg, time_str) + model_combinations = list(product(self.base_models, self.compare_models)) + unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]]) + + if self.meta_judge_model is not None: + self.judge_models.append(self.meta_judge_model) + + scores = {} + + for idx, judge_model_cfg in enumerate(self.judge_models): + judge_model = model_abbr_from_cfg(judge_model_cfg) + for dataset in self.cfg['datasets']: + dataset_abbr = dataset_abbr_from_cfg(dataset) + for model_pair in unique_combinations: + model1 = model_pair[0]['abbr'] + model2 = model_pair[1]['abbr'] + if idx == len(self.judge_models): + subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model + else: + subdir = model1 + '_' + model2 + '_judged-by--' + judge_model + subdir_path = os.path.join(results_folder, subdir) + if not os.path.isdir(subdir_path): + print(subdir_path + ' is not exist! 
please check!') + continue + judged_answers, references, result_items = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function) + if self.check_pos_bias: + bias_num = check_position_bias(judged_answers, references) + else: + bias_num = 0 + win_model1 = defaultdict(float) + win_model2 = defaultdict(float) + categories = defaultdict(float) + difficulties = defaultdict(float) + languages = defaultdict(float) + model1 = references[0]['answer1'] + model2 = references[0]['answer2'] + for prediction, reference, result_item in zip(judged_answers, references, result_items): + categories[dataset_abbr] += 1 + categories[reference['category']] += 1 + difficulties['Level-' + str(reference['level'])] += 1 + languages['Lan-' + reference['lan']] += 1 + + if prediction == 'A': + if reference['answer1'] == model1: + score_1, score_2 = 1, 0 + else: + score_1, score_2 = 0, 1 + elif prediction == 'B': + if reference['answer1'] == model1: + score_1, score_2 = 0, 1 + else: + score_1, score_2 = 1, 0 + elif prediction == 'C': + if self.summary_type == 'half_add': + score_1, score_2 = 0.5, 0.5 + else: + score_1, score_2 = 0, 0 + + # 进行分数修正 + if self.word_count_threshold is not None: + try: + if reference['lan'] == 'zh': + answer1 = re.search(r'\[回答1开始\](.*)\[回答1结束\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip() + answer2 = re.search(r'\[回答2开始\](.*)\[回答2结束\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip() + else: + answer1 = re.search(r'\[Response 1 Start\](.*)\[Response 1 End\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip() + answer2 = re.search(r'\[Response 2 Start\](.*)\[Response 2 End\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip() + word_count1 = count_chinese_characters(answer1) + count_english_words(answer1) + word_count2 = count_chinese_characters(answer2) + count_english_words(answer2) + if score_1 == 1 and score_2 == 0 and word_count1 - word_count2 > self.word_count_threshold: + score_1, score_2 = 0.5, 0.5 + elif score_1 == 0 and score_2 == 1 and word_count2 - word_count1 > self.word_count_threshold: + score_1, score_2 = 0.5, 0.5 + except Exception as e: + print(e) + from IPython import embed; embed(); exit() + + win_model1[reference['category']] += score_1 + win_model1[dataset_abbr] += score_1 + win_model1['Level-' + str(reference['level'])] += score_1 + win_model1['Lan-' + reference['lan']] += score_1 + win_model2[reference['category']] += score_2 + win_model2[dataset_abbr] += score_2 + win_model2['Level-' + str(reference['level'])] += score_2 + win_model2['Lan-' + reference['lan']] += score_2 + for category in categories: + win_model1[category] = win_model1[category] / categories[category] * 100 + win_model1[category] = round(win_model1[category], 2) + win_model2[category] = win_model2[category] / categories[category] * 100 + win_model2[category] = round(win_model2[category], 2) + win_model1['position_bias'] = bias_num + win_model2['position_bias'] = bias_num + for difficulty in difficulties: + win_model1[difficulty] = win_model1[difficulty] / difficulties[difficulty] * 100 + win_model2[difficulty] = win_model2[difficulty] / difficulties[difficulty] * 100 + for language in languages: + win_model1[language] = win_model1[language] / languages[language] * 100 + win_model2[language] = win_model2[language] / languages[language] * 100 + + if judge_model not in scores: + scores[judge_model] = {} + if dataset_abbr not in 
scores[judge_model]: + scores[judge_model][dataset_abbr] = {} + scores[judge_model][dataset_abbr][model2] = win_model2 + + return scores, difficulties, languages + + def summarize( + self, + time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'), + ): + """Summarize the subjectivity analysis based on evaluation results. + + Args: + time_str (str): Timestamp for file naming. + + Returns: + pd.DataFrame: The summary results. + """ + + + scores, difficulties, languages = self.get_score(time_str) + # scores['win_' + model1] = win_model1 + output_dir, results_folder = get_outdir(self.cfg, time_str) + all_judge_file_list = [] + + for idx, judge_model in enumerate(self.judge_models): + judge_abbr = model_abbr_from_cfg(judge_model) + for dataset in self.cfg['datasets']: + dataset_abbr = dataset_abbr_from_cfg(dataset) + summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models] + one_column = list(scores[judge_abbr][dataset_abbr].values())[0] + detail_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias'] and i not in difficulties and i not in languages] + row_headers = [dataset_abbr, 'position_bias'] + for difficulty in difficulties: + row_headers += [difficulty] + for language in languages: + row_headers += [language] + row_headers += detail_headers + headers = [''] + summarizer_model_abbrs + table = [] + for row_header in row_headers: + row = [row_header] + for model_cfg in self.compare_models: + model_abbr = model_abbr_from_cfg(model_cfg) + s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '') + if isinstance(s, float): + s = f'{s:.2f}' + if isinstance(s, int): + s = str(s) + row.append(s) + table.append(row) + txt = tabulate(table, headers=headers) + # print(txt) + + if idx == len(self.judge_models): + output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv') + else: + output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv') + + with open(output_filename, 'w') as f: + f.write(','.join(headers) + '\n') + for line in table: + f.write(','.join(line) + '\n') + print(output_filename) + # print(output_filename) + all_judge_file_list.append(output_filename) + + + dfs = [pd.read_csv(file) for file in all_judge_file_list] + + average_df = copy.deepcopy(dfs[0]) + for col in dfs[0].columns[1:]: + for i in range(1, len(dfs[0])): + average_df[col][i] = round(sum(df[col][i] for df in dfs) / len(dfs), 2) + average_csv_path = osp.join(output_dir, 'Averaged-' + dataset_abbr + '-report.csv') + average_df.to_csv(average_csv_path, index=False) + print(average_csv_path) diff --git a/opencompass/summarizers/subjective/utils.py b/opencompass/summarizers/subjective/utils.py index 1a2cf994..27d84660 100644 --- a/opencompass/summarizers/subjective/utils.py +++ b/opencompass/summarizers/subjective/utils.py @@ -64,6 +64,9 @@ def get_judgeanswer_and_reference(dataset, subdir_path, post_process): if processed_judge is not None: judged_answers.append(processed_judge) references.append(v['gold']) + # else: + # print(v['prediction']) + # print('-' * 128) if len(judged_answers) != len(result): print( f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!' 
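Note on the cross-judge averaging step in CompassBenchSummarizer.summarize and CompassBenchTHSummarizer.summarize above: the per-judge report CSVs are combined with a nested loop that writes cells one at a time via chained indexing (average_df[col][i] = ...) and starts at row index 1. The following is a minimal vectorized sketch of the same idea, not part of the patch; the file names are hypothetical, and unlike the loop above it averages every row rather than skipping the first one.

    import pandas as pd

    judge_csvs = ['judged-by--judge-A-report.csv',   # hypothetical per-judge reports
                  'judged-by--judge-B-report.csv']
    dfs = [pd.read_csv(p) for p in judge_csvs]
    key = dfs[0].columns[0]                          # first column holds the row headers
    stacked = pd.concat(dfs, ignore_index=True)
    averaged = (stacked.groupby(key, sort=False)     # keep the original row order
                       .mean(numeric_only=True)
                       .round(2)
                       .reset_index())
    averaged.to_csv('Averaged-report.csv', index=False)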
diff --git a/opencompass/tasks/openicl_infer.py b/opencompass/tasks/openicl_infer.py index fa47df33..a4a83190 100644 --- a/opencompass/tasks/openicl_infer.py +++ b/opencompass/tasks/openicl_infer.py @@ -14,7 +14,7 @@ from opencompass.registry import (ICL_INFERENCERS, ICL_PROMPT_TEMPLATES, from opencompass.tasks.base import BaseTask from opencompass.utils import (build_dataset_from_cfg, build_model_from_cfg, get_infer_output_path, get_logger, - task_abbr_from_cfg) + model_abbr_from_cfg, task_abbr_from_cfg) @TASKS.register_module(force=(__name__ == '__main__')) # A hack for script run @@ -61,17 +61,16 @@ class OpenICLInferTask(BaseTask): return template.format(task_cmd=command) - def run(self, cur_model=None): + def run(self, cur_model=None, cur_model_abbr=None): self.logger.info(f'Task {task_abbr_from_cfg(self.cfg)}') for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs): self.max_out_len = model_cfg.get('max_out_len', None) self.batch_size = model_cfg.get('batch_size', None) self.min_out_len = model_cfg.get('min_out_len', None) - if cur_model: + if cur_model and cur_model_abbr == model_abbr_from_cfg(model_cfg): self.model = cur_model else: self.model = build_model_from_cfg(model_cfg) - cur_model = self.model for dataset_cfg in dataset_cfgs: self.model_cfg = model_cfg diff --git a/opencompass/tasks/subjective_eval.py b/opencompass/tasks/subjective_eval.py index a16d3141..a4817d5a 100644 --- a/opencompass/tasks/subjective_eval.py +++ b/opencompass/tasks/subjective_eval.py @@ -94,6 +94,15 @@ class SubjectiveEvalTask(BaseTask): self._score(model_cfg, dataset_cfg, eval_cfg, output_column, self.meta) + @property + def name(self) -> str: + task_name = task_abbr_from_cfg({ + 'models': self.model_cfgs, + 'datasets': self.dataset_cfgs + }) + return self.name_prefix + task_name + \ + '--judge-by--' + model_abbr_from_cfg(self.judge_cfg) + def _load_model_pred( self, model_cfg: Union[ConfigDict, List[ConfigDict]], diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index 7baddbb4..ab8ba05b 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -102,13 +102,12 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str: f'^选项\s?([{options}])', f'^([{options}])\s?选?项', f'(\s|^)[{options}][\s。,,::\.$]', - f'(\s|^)[{options}](\s|$)', f'1.\s?(.*?)$', f'1.\s?([{options}])[.。$]?$', ] cushion_patterns = [ f'([{options}]):', - f'[{options}]', + f'([{options}])', ] # flake8: noqa # yapf: enable diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 525f8a78..b81a5220 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -34,7 +34,7 @@ rouge -e git+https://github.com/Isaac-JL-Chen/rouge_chinese.git@master#egg=rouge_chinese rouge_score sacrebleu -scikit_learn==1.2.1 +scikit_learn==1.5.0 seaborn sentence_transformers==2.2.2 tabulate diff --git a/tools/convert_alignmentbench.py b/tools/convert_alignmentbench.py index 1b759710..8f9699f9 100644 --- a/tools/convert_alignmentbench.py +++ b/tools/convert_alignmentbench.py @@ -77,7 +77,7 @@ def parse_args(): default='./data_release.jsonl', help='The original jsonl path') parser.add_argument('--json', - default='your prediction file path', + default='./alignment_bench.json', help='The results json path') parser.add_argument('--exp-folder', help='The results json name') args = parser.parse_args() @@ -89,6 +89,6 @@ if __name__ == '__main__': mode = args.mode if mode == 'json': processed_data = 
process_jsonl(args.jsonl) - save_as_json(processed_data) + save_as_json(processed_data, args.json) elif mode == 'csv': extract_predictions_from_json(args.exp_folder) diff --git a/tools/prediction_merger.py b/tools/prediction_merger.py index cefef821..2614686d 100644 --- a/tools/prediction_merger.py +++ b/tools/prediction_merger.py @@ -16,6 +16,7 @@ def parse_args(): parser.add_argument('-w', '--work-dir', default=None, type=str) parser.add_argument('-r', '--reuse', default='latest', type=str) parser.add_argument('-c', '--clean', action='store_true') + parser.add_argument('-f', '--force', action='store_true') args = parser.parse_args() return args @@ -35,7 +36,8 @@ class PredictionMerger: root, ext = os.path.splitext(filename) partial_filename = root + '_0' + ext - if os.path.exists(os.path.realpath(filename)): + if os.path.exists( + os.path.realpath(filename)) and not self.cfg['force']: return if not os.path.exists(os.path.realpath(partial_filename)): @@ -77,7 +79,8 @@ def dispatch_tasks(cfg): 'model': model, 'dataset': dataset, 'work_dir': cfg['work_dir'], - 'clean': cfg['clean'] + 'clean': cfg['clean'], + 'force': cfg['force'], }).run() @@ -104,6 +107,7 @@ def main(): cfg['work_dir'] = os.path.join(cfg.work_dir, dir_time_str) cfg['clean'] = args.clean + cfg['force'] = args.force dispatch_tasks(cfg)
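Note on the new --force flag and the '<abbr>_<i>.json' shard handling used by PredictionMerger above (and by get_judgeanswer_and_reference earlier in this patch): a previously merged '<abbr>.json' is reused when it exists, unless --force is passed, in which case the numbered shards are stitched together again. A minimal standalone sketch of that pattern, assuming a hypothetical helper name rather than the repo's API:

    import json
    import os


    def merge_prediction_shards(subdir: str, abbr: str, force: bool = False) -> dict:
        """Merge '<abbr>_0.json', '<abbr>_1.json', ... into one dict, unless
        '<abbr>.json' already exists and force is False."""
        merged_path = os.path.join(subdir, f'{abbr}.json')
        if os.path.exists(merged_path) and not force:
            with open(merged_path) as f:
                return json.load(f)
        merged, i = {}, 0
        shard = os.path.join(subdir, f'{abbr}_{i}.json')
        while os.path.exists(shard):
            with open(shard) as f:
                for value in json.load(f).values():
                    merged[str(len(merged))] = value   # re-key entries consecutively
            i += 1
            shard = os.path.join(subdir, f'{abbr}_{i}.json')
        return merged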